You are viewing a plain text version of this content. The canonical link for it is here.
Posted to by on 2016/07/05 22:48:57 UTC

[13/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
deleted file mode 100644
index 9f616fe..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
+++ /dev/null
@@ -1,587 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-// JDK imports
-import java.util.*;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.concurrent.ThreadLocalRandom;
-// Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.DeflateUtils;
-import org.apache.hadoop.util.StringUtils;
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-// crawler-commons imports
-import crawlercommons.robots.BaseRobotRules;
-public abstract class HttpBase implements Protocol {
-  /**
-   * Metadata key under which {@link #getProtocolOutput} stores the elapsed
-   * request time (milliseconds) in the CrawlDatum.
-   */
-  public static final Text RESPONSE_TIME = new Text("_rs_");
-  /**
-   * 8 KiB. NOTE(review): presumably an I/O buffer size for subclasses; it is
-   * not used in the code visible here -- confirm.
-   */
-  public static final int BUFFER_SIZE = 8 * 1024;
-  /** Substituted for a {@code null} response body when building the Content. */
-  private static final byte[] EMPTY_CONTENT = new byte[0];
-  /** Robots.txt parser; created in the constructor, configured in setConf. */
-  private HttpRobotRulesParser robots = null;
-  /**
-   * Agent names read from the rotation file when agent rotation is enabled;
-   * {@code null} when rotation is off or the file was unusable or empty.
-   */
-  private ArrayList<String> userAgentNames = null;
-  /** The proxy hostname. */
-  protected String proxyHost = null;
-  /** The proxy port. */
-  protected int proxyPort = 8080;
-  /** The proxy exception list. */
-  protected HashMap proxyException = new HashMap(); 
-  /** Indicates if a proxy is used */
-  protected boolean useProxy = false;
-  /** The network timeout in milliseconds */
-  protected int timeout = 10000;
-  /** The length limit for downloaded content, in bytes. */
-  protected int maxContent = 64 * 1024;
-  /** The Nutch 'User-Agent' request header */
-  protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
-      "", "");
-  /** The "Accept-Language" request header value. */
-  protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
-  /** The "Accept" request header value. */
-  protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-  /** The default logger */
-  private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
-  /** The specified logger */
-  private Logger logger = LOGGER;
-  /** The nutch configuration */
-  private Configuration conf = null;
-  /** Do we use HTTP/1.1? */
-  protected boolean useHttp11 = false;
-  /**
-   * Whether to record the response time in the CrawlDatum's meta data (under
-   * the {@link #RESPONSE_TIME} key).
-   * NOTE(review): the name of the controlling configuration property is
-   * missing from this copy of the file -- confirm against the repository.
-   */
-  protected boolean responseTime = true;
-  /** Skip page if Crawl-Delay longer than this value. */
-  protected long maxCrawlDelay = -1L;
-  /** Which TLS/SSL protocols to support */
-  protected Set<String> tlsPreferredProtocols;
-  /** Which TLS/SSL cipher suites to support */
-  protected Set<String> tlsPreferredCipherSuites;
-  /** Configuration directive for If-Modified-Since HTTP header */
-  public boolean enableIfModifiedsinceHeader = true;
-  /** Creates a new instance of HttpBase that logs to the class-level default logger. */
-  public HttpBase() {
-    this(null);
-  }
-  /**
-   * Creates a new instance of HttpBase.
-   *
-   * @param logger
-   *          logger to use for this instance; when {@code null} the
-   *          class-level default logger is kept
-   */
-  public HttpBase(Logger logger) {
-    if (logger != null) {
-      this.logger = logger;
-    }
-    // The parser is only constructed here; it receives its configuration in setConf.
-    robots = new HttpRobotRulesParser();
-  }
-  // Inherited Javadoc
-  //
-  // Reads every HTTP setting from the given configuration into the fields of
-  // this instance, forwards the configuration to the robots parser, optionally
-  // loads the list of rotating user agent names, builds the TLS protocol and
-  // cipher suite sets, and finally calls logConf().
-  //
-  // NOTE(review): several property keys below are empty strings ("") in this
-  // copy of the file (proxy host, agent name, agent email, response time).
-  // They look like they were stripped when the text was archived -- confirm
-  // the real keys against the repository before relying on this code.
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.proxyHost = conf.get("");
-    this.proxyPort = conf.getInt("http.proxy.port", 8080);
-    this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
-    // A proxy is considered configured as soon as a non-empty host is set.
-    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
-    this.timeout = conf.getInt("http.timeout", 10000);
-    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
-    this.userAgent = getAgentString(conf.get(""),
-        conf.get("http.agent.version"), conf.get("http.agent.description"),
-        conf.get("http.agent.url"), conf.get(""));
-    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
-    this.accept = conf.get("http.accept", accept);
-    // backward-compatible default setting
-    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
-    this.responseTime = conf.getBoolean("", true);
-    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
-    this.robots.setConf(conf);
-    // NUTCH-1941: read list of alternating agent names
-    if (conf.getBoolean("http.agent.rotate", false)) {
-      String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
-      BufferedReader br = null;
-      try {
-        Reader reader = conf.getConfResourceAsReader(agentsFile);
-        br = new BufferedReader(reader);
-        userAgentNames = new ArrayList<String>();
-        String word = "";
-        // One agent name per line; blank lines are skipped.
-        while ((word = br.readLine()) != null) {
-          if (!word.trim().isEmpty())
-            userAgentNames.add(word.trim());
-        }
-        // An empty list is treated like a missing file: rotation is disabled.
-        if (userAgentNames.size() == 0) {
-          logger.warn("Empty list of user agents in http.agent.rotate.file {}",
-              agentsFile);
-          userAgentNames = null;
-        }
-      } catch (Exception e) {
-        logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
-            StringUtils.stringifyException(e));
-        userAgentNames = null;
-      } finally {
-        if (br != null) {
-          try {
-            br.close();
-          } catch (IOException e) {
-            // ignore
-          }
-        }
-      }
-      if (userAgentNames == null) {
-        logger
-            .warn("Falling back to fixed user agent set via property");
-      }
-    }
-    // The literals after the key are the defaults used when the property is unset.
-    String[] protocols = conf.getStrings("http.tls.supported.protocols",
-        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
-    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
-        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
-        "TLS_RSA_WITH_AES_256_CBC_SHA256",
-        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
-        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
-        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
-        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
-        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
-        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
-        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
-        "TLS_KRB5_WITH_DES_CBC_MD5");
-    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
-    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
-    logConf();
-  }
-  // Inherited Javadoc
-  // Returns the configuration last passed to setConf, or null if setConf was
-  // never called.
-  public Configuration getConf() {
-    return this.conf;
-  }
-  /**
-   * Fetches {@code url} through {@link #getResponse} and translates the HTTP
-   * status code into a {@link ProtocolOutput}.
-   * <p>
-   * Code 200 yields the content alone. Codes 300-399 yield a redirect-type
-   * status carrying the target obtained by resolving the Location header
-   * against the request URL (304 maps to NOTMODIFIED). 400 and 410 map to
-   * GONE, 401 to ACCESS_DENIED, 404 to NOTFOUND, and every other code to
-   * EXCEPTION. The status code is always written to the datum's metadata; the
-   * elapsed time is written too when response-time recording is enabled.
-   *
-   * @param url
-   *          URL to fetch
-   * @param datum
-   *          crawl datum passed on to {@link #getResponse}; its metadata is
-   *          updated as described above
-   * @return the protocol output; any {@link Throwable} raised while fetching
-   *         is logged and returned as an output with {@code null} content
-   */
-  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
-    String urlString = url.toString();
-    try {
-      URL u = new URL(urlString);
-      long startTime = System.currentTimeMillis();
-      Response response = getResponse(u, datum, false); // make a request
-      if (this.responseTime) {
-        int elapsedTime = (int) (System.currentTimeMillis() - startTime);
-        datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
-      }
-      int code = response.getCode();
-      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
-        new Text(Integer.toString(code)));
-      byte[] content = response.getContent();
-      // Content is built for every status code so that callers always get the
-      // response headers; a missing body is replaced by an empty array.
-      Content c = new Content(u.toString(), u.toString(),
-          (content == null ? EMPTY_CONTENT : content),
-          response.getHeader("Content-Type"), response.getHeaders(), this.conf);
-      if (code == 200) { // got a good response
-        return new ProtocolOutput(c); // return it
-      } else if (code >= 300 && code < 400) { // handle redirect
-        String location = response.getHeader("Location");
-        // some broken servers, such as MS IIS, use lowercase header name...
-        if (location == null)
-          location = response.getHeader("location");
-        // With no Location header at all the empty string resolves to the
-        // request URL itself.
-        if (location == null)
-          location = "";
-        u = new URL(u, location);
-        int protocolStatusCode;
-        switch (code) {
-        case 300: // multiple choices, preferred value in Location
-          protocolStatusCode = ProtocolStatus.MOVED;
-          break;
-        case 301: // moved permanently
-        case 305: // use proxy (Location is URL of proxy)
-          protocolStatusCode = ProtocolStatus.MOVED;
-          break;
-        case 302: // found (temporarily moved)
-        case 303: // see other (redirect after POST)
-        case 307: // temporary redirect
-          protocolStatusCode = ProtocolStatus.TEMP_MOVED;
-          break;
-        case 304: // not modified
-          protocolStatusCode = ProtocolStatus.NOTMODIFIED;
-          break;
-        default:
-          // any other 3xx code is treated as a permanent move
-          protocolStatusCode = ProtocolStatus.MOVED;
-        }
-        // handle this in the higher layer.
-        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
-      } else if (code == 400) { // bad request, mark as GONE
-        if (logger.isTraceEnabled()) {
-          logger.trace("400 Bad request: " + u);
-        }
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
-      } else if (code == 401) { // requires authorization, but no valid auth
-                                // provided.
-        if (logger.isTraceEnabled()) {
-          logger.trace("401 Authentication Required");
-        }
-        return new ProtocolOutput(c, new ProtocolStatus(
-            ProtocolStatus.ACCESS_DENIED, "Authentication required: "
-                + urlString));
-      } else if (code == 404) {
-        return new ProtocolOutput(c, new ProtocolStatus(
-            ProtocolStatus.NOTFOUND, u));
-      } else if (code == 410) { // permanently GONE
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
-            "Http: " + code + " url=" + u));
-      } else {
-        return new ProtocolOutput(c, new ProtocolStatus(
-            ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
-      }
-    } catch (Throwable e) {
-      // Deliberately broad: any failure is reported through the returned status.
-      logger.error("Failed to get protocol output", e);
-      return new ProtocolOutput(null, new ProtocolStatus(e));
-    }
-  }
-  /*
-   * -------------------------- * </implementation:Protocol> *
-   * --------------------------
-   */
-  /** @return the configured proxy host name, or {@code null} if none is set */
-  public String getProxyHost() {
-    return proxyHost;
-  }
-  /** @return the configured proxy port (8080 unless overridden) */
-  public int getProxyPort() {
-    return proxyPort;
-  }
-  /**
-   * Decides whether a request for the given URL should go through the proxy.
-   *
-   * @param url
-   *          the URL about to be fetched
-   * @return {@code true} only if a proxy is configured and the URL's host has
-   *         no entry in the proxy exception list
-   */
-  public boolean useProxy(URL url) {
-    // Short-circuits before touching url when no proxy is configured.
-    return useProxy && proxyException.get(url.getHost()) == null;
-  }
-  /** @return the network timeout in milliseconds */
-  public int getTimeout() {
-    return timeout;
-  }
-  /** @return the value of the If-Modified-Since configuration flag */
-  public boolean isIfModifiedSinceEnabled() {
-    return enableIfModifiedsinceHeader;
-  }
-  /** @return the length limit for downloaded content, in bytes */
-  public int getMaxContent() {
-    return maxContent;
-  }
-  /**
-   * Returns the value to send as the User-Agent request header.
-   *
-   * @return a uniformly random entry of the rotating agent list when one was
-   *         loaded, otherwise the fixed user agent string
-   */
-  public String getUserAgent() {
-    if (userAgentNames != null) {
-      // The bound of nextInt is exclusive, so it must be size(), not
-      // size() - 1: the old bound never picked the last entry and threw
-      // IllegalArgumentException (bound 0) for a single-entry list.
-      return userAgentNames.get(ThreadLocalRandom.current().nextInt(
-          userAgentNames.size()));
-    }
-    return userAgent;
-  }
-  /**
-   * Value of "Accept-Language" request header sent by Nutch.
-   * 
-   * @return The value of the "Accept-Language" header.
-   */
-  public String getAcceptLanguage() {
-    return acceptLanguage;
-  }
-  /** @return the value of the "Accept" request header */
-  public String getAccept() {
-    return accept;
-  }
-  /** @return the value of the HTTP/1.1 configuration flag */
-  public boolean getUseHttp11() {
-    return useHttp11;
-  }
-  /** @return the TLS/SSL cipher suite names built in setConf ({@code null} before that) */
-  public Set<String> getTlsPreferredCipherSuites() {
-    return tlsPreferredCipherSuites;
-  }
-  /** @return the TLS/SSL protocol names built in setConf ({@code null} before that) */
-  public Set<String> getTlsPreferredProtocols() {
-    return tlsPreferredProtocols;
-  }
-  /**
-   * Builds the User-Agent header value in the form
-   * {@code name/version (description; url; email)}.
-   * <p>
-   * The version and each parenthesised detail are optional; details that are
-   * {@code null} or empty are left out together with their separator, and the
-   * parentheses are omitted when no detail remains. A missing agent name is
-   * logged as an error but, as before, still appended as-is.
-   *
-   * @param agentName
-   *          agent name (should not be empty)
-   * @param agentVersion
-   *          agent version, or {@code null}
-   * @param agentDesc
-   *          agent description, or {@code null}
-   * @param agentURL
-   *          agent URL, or {@code null}
-   * @param agentEmail
-   *          agent contact email, or {@code null}
-   * @return the assembled User-Agent string
-   */
-  private static String getAgentString(String agentName, String agentVersion,
-      String agentDesc, String agentURL, String agentEmail) {
-    if ((agentName == null) || (agentName.trim().length() == 0)) {
-      // TODO : NUTCH-258
-      if (LOGGER.isErrorEnabled()) {
-        LOGGER.error("No User-Agent string set (!");
-      }
-    }
-    StringBuilder buf = new StringBuilder();
-    buf.append(agentName);
-    if (agentVersion != null) {
-      buf.append("/");
-      buf.append(agentVersion);
-    }
-    // Join only the non-empty details. The previous code emitted "; " whenever
-    // a later detail was non-null, even if it was empty, which produced
-    // strings such as "NutchCVS (Nutch; )".
-    StringBuilder details = new StringBuilder();
-    for (String detail : new String[] { agentDesc, agentURL, agentEmail }) {
-      if ((detail != null) && (detail.length() != 0)) {
-        if (details.length() > 0) {
-          details.append("; ");
-        }
-        details.append(detail);
-      }
-    }
-    if (details.length() > 0) {
-      buf.append(" (").append(details).append(")");
-    }
-    return buf.toString();
-  }
-  /**
-   * Logs the effective HTTP settings when info logging is enabled.
-   * NOTE(review): in this copy of the file the logging call at the start of
-   * each line below (and the property name on the first line) is missing,
-   * presumably stripped when the text was archived -- confirm against the
-   * repository.
-   * NOTE(review): the "http.proxy.exception.list" line prints useProxy rather
-   * than the exception list itself; that looks unintended -- confirm.
-   */
-  protected void logConf() {
-    if (logger.isInfoEnabled()) {
-" = " + proxyHost);
-"http.proxy.port = " + proxyPort);
-"http.proxy.exception.list = " + useProxy);
-"http.timeout = " + timeout);
-"http.content.limit = " + maxContent);
-"http.agent = " + userAgent);
-"http.accept.language = " + acceptLanguage);
-"http.accept = " + accept);
-    }
-  }
-  /**
-   * Unzips a gzip-encoded response body on a best-effort basis.
-   *
-   * @param compressed
-   *          the raw (gzip-encoded) body; returned unchanged when empty
-   * @param url
-   *          the URL the body came from; used for trace logging only
-   * @return the unzipped bytes; {@link #getMaxContent()} is passed on as the
-   *         size limit when it is non-negative
-   * @throws IOException
-   *           if the unzip helper returns {@code null}
-   */
-  public byte[] processGzipEncoded(byte[] compressed, URL url)
-      throws IOException {
-    if (LOGGER.isTraceEnabled()) {
-      LOGGER.trace("uncompressing....");
-    }
-    // An empty body (e.g. on a redirect) leaves nothing to unzip.
-    if (compressed.length == 0) {
-      return compressed;
-    }
-    final byte[] unzipped = (getMaxContent() >= 0)
-        ? GZIPUtils.unzipBestEffort(compressed, getMaxContent())
-        : GZIPUtils.unzipBestEffort(compressed);
-    if (unzipped == null) {
-      throw new IOException("unzipBestEffort returned null");
-    }
-    if (LOGGER.isTraceEnabled()) {
-      LOGGER.trace("fetched " + compressed.length
-          + " bytes of compressed content (expanded to " + unzipped.length
-          + " bytes) from " + url);
-    }
-    return unzipped;
-  }
-  /**
-   * Inflates a deflate-encoded response body on a best-effort basis.
-   *
-   * @param compressed
-   *          the raw (deflate-encoded) body; returned unchanged when empty
-   * @param url
-   *          the URL the body came from; used for trace logging only
-   * @return the inflated bytes; {@link #getMaxContent()} is passed on as the
-   *         size limit when it is non-negative
-   * @throws IOException
-   *           if the inflate helper returns {@code null}
-   */
-  public byte[] processDeflateEncoded(byte[] compressed, URL url)
-      throws IOException {
-    // An empty body (e.g. on a redirect) leaves nothing to inflate.
-    if (compressed.length == 0) {
-      return compressed;
-    }
-    if (LOGGER.isTraceEnabled()) {
-      LOGGER.trace("inflating....");
-    }
-    final byte[] inflated = (getMaxContent() >= 0)
-        ? DeflateUtils.inflateBestEffort(compressed, getMaxContent())
-        : DeflateUtils.inflateBestEffort(compressed);
-    if (inflated == null) {
-      throw new IOException("inflateBestEffort returned null");
-    }
-    if (LOGGER.isTraceEnabled()) {
-      LOGGER.trace("fetched " + compressed.length
-          + " bytes of compressed content (expanded to " + inflated.length
-          + " bytes) from " + url);
-    }
-    return inflated;
-  }
-  /**
-   * Command-line driver shared by concrete subclasses: fetches one URL with
-   * the given protocol instance and prints status, content type, content
-   * length and content to standard output.
-   * <p>
-   * Prints the usage line and exits with status -1 when the arguments are
-   * malformed, including a {@code -timeout} without a value and an argument
-   * list that contains no URL.
-   *
-   * @param http
-   *          the protocol instance to fetch with
-   * @param args
-   *          {@code [-verbose] [-timeout N] url}; N is in seconds
-   * @throws Exception
-   *           on any failure not reported through the protocol status, e.g.
-   *           a non-numeric timeout value
-   */
-  protected static void main(HttpBase http, String[] args) throws Exception {
-    String url = null;
-    String usage = "Usage: Http [-verbose] [-timeout N] url";
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
-    }
-    for (int i = 0; i < args.length; i++) { // parse command line
-      if (args[i].equals("-timeout")) { // found -timeout option
-        if (i + 1 >= args.length) {
-          // "-timeout" was the last argument: its value is missing
-          System.err.println(usage);
-          System.exit(-1);
-        }
-        http.timeout = Integer.parseInt(args[++i]) * 1000;
-      } else if (args[i].equals("-verbose")) { // found -verbose option
-        // accepted for compatibility; it currently has no effect
-      } else if (i != args.length - 1) {
-        System.err.println(usage);
-        System.exit(-1);
-      } else {
-        // root is required parameter
-        url = args[i];
-      }
-    }
-    if (url == null) {
-      // only options were given; previously this failed later on a null URL
-      System.err.println(usage);
-      System.exit(-1);
-    }
-    ProtocolOutput out = http
-        .getProtocolOutput(new Text(url), new CrawlDatum());
-    Content content = out.getContent();
-    System.out.println("Status: " + out.getStatus());
-    if (content != null) {
-      System.out.println("Content Type: " + content.getContentType());
-      System.out.println("Content Length: "
-          + content.getMetadata().get(Response.CONTENT_LENGTH));
-      System.out.println("Content:");
-      // NOTE(review): decodes with the platform default charset, as before.
-      String text = new String(content.getContent());
-      System.out.println(text);
-    }
-  }
-  /**
-   * Performs the actual HTTP request; implemented by the concrete protocol
-   * plugins.
-   *
-   * @param url
-   *          the URL to request
-   * @param datum
-   *          the crawl datum associated with the URL
-   * @param followRedirects
-   *          NOTE(review): presumably whether the implementation should follow
-   *          redirects itself -- confirm against the implementations
-   * @return the response
-   * @throws ProtocolException
-   *           declared for implementations; conditions not visible here
-   * @throws IOException
-   *           declared for implementations; conditions not visible here
-   */
-  protected abstract Response getResponse(URL url, CrawlDatum datum,
-      boolean followRedirects) throws ProtocolException, IOException;
-  /**
-   * Returns the robot rules that apply to the given URL by delegating to the
-   * robots parser of this instance. The {@code datum} argument is not used.
-   */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
-  }
-  /**
-   * Transforms a String[] into a HashMap for faster searching. Every entry is
-   * trimmed and stored as both key and value; {@code null}, empty and
-   * whitespace-only entries are skipped.
-   *
-   * @param input
-   *          the entries to index; may be {@code null} or empty
-   * @return a new HashMap, never {@code null}
-   */
-  private HashMap arrayToMap(String[] input) {
-    HashMap<String, String> hm = new HashMap<String, String>();
-    if (input == null) {
-      return hm;
-    }
-    for (String entry : input) {
-      if (entry == null) {
-        continue;
-      }
-      // Store the trimmed value: the previous code tested the trimmed entry
-      // but stored the untrimmed one, so " host" could never match a lookup
-      // by url.getHost().
-      String trimmed = entry.trim();
-      if (!trimmed.isEmpty()) {
-        hm.put(trimmed, trimmed);
-      }
-    }
-    return hm;
-  }
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
deleted file mode 100644
index ff7ef5b..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
+++ /dev/null
@@ -1,40 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-// Nutch imports
-import org.apache.nutch.protocol.ProtocolException;
-/**
- * HTTP-specific subclass of {@link ProtocolException}. It adds no state; every
- * constructor simply delegates to the matching superclass constructor.
- */
-public class HttpException extends ProtocolException {
-  /** Creates an exception without message or cause. */
-  public HttpException() {
-    super();
-  }
-  /** Creates an exception with the given message. */
-  public HttpException(String message) {
-    super(message);
-  }
-  /** Creates an exception with the given message and cause. */
-  public HttpException(String message, Throwable cause) {
-    super(message, cause);
-  }
-  /** Creates an exception wrapping the given cause. */
-  public HttpException(Throwable cause) {
-    super(cause);
-  }
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
deleted file mode 100644
index 185ca15..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
+++ /dev/null
@@ -1,167 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.RobotRulesParser;
-import crawlercommons.robots.BaseRobotRules;
-/**
- * This class is used for parsing robots.txt for URLs belonging to the HTTP
- * protocol. It extends the generic {@link RobotRulesParser} class and contains
- * the HTTP-specific implementation for obtaining the robots file.
- */
-public class HttpRobotRulesParser extends RobotRulesParser {
-  public static final Logger LOG = LoggerFactory
-      .getLogger(HttpRobotRulesParser.class);
-  /**
-   * When {@code false}, a 403 response for robots.txt results in forbid-all
-   * rules. Note that the initial value here is {@code false} while setConf
-   * uses {@code true} as the default of "http.robots.403.allow".
-   */
-  protected boolean allowForbidden = false;
-  /** Creates an unconfigured parser; setConf must be called before use. */
-  HttpRobotRulesParser() {
-  }
-  /** Creates a parser and configures it immediately via setConf. */
-  public HttpRobotRulesParser(Configuration conf) {
-    setConf(conf);
-  }
-  /** Applies the configuration to the superclass and reads "http.robots.403.allow". */
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-    allowForbidden = conf.getBoolean("http.robots.403.allow", true);
-  }
-  /**
-   * Compose unique key to store and access robot rules in cache for given URL.
-   * <p>
-   * Robot rules apply only to host, protocol, and port where robots.txt is
-   * hosted (cf. NUTCH-1752). Consequently the key has the form
-   * {@code protocol:host:port}, with protocol and host lower-cased and the
-   * protocol's default port substituted when the URL names none.
-   *
-   * @param url
-   *          the URL to build the key for
-   * @return the cache key
-   */
-  protected static String getCacheKey(URL url) {
-    // Locale.ROOT keeps the key independent of the JVM's default locale
-    // (e.g. under a Turkish locale "I" would otherwise lower-case to a
-    // dotless i, giving the same host two different keys).
-    String protocol = url.getProtocol().toLowerCase(java.util.Locale.ROOT);
-    String host = url.getHost().toLowerCase(java.util.Locale.ROOT);
-    int port = url.getPort();
-    if (port == -1) {
-      port = url.getDefaultPort();
-    }
-    return protocol + ":" + host + ":" + port;
-  }
-  /**
-   * Get the rules from robots.txt which applies for the given {@code url}.
-   * Robot rules are cached for a unique combination of host, protocol, and
-   * port. If no rules are found in the cache, an HTTP request is sent to fetch
-   * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
-   * rules are cached to avoid re-fetching and re-parsing it again.
-   * <p>
-   * Rules are not cached when the server answers with a code of 500 or above
-   * or when the fetch throws, so that a later call tries again. If one
-   * redirect was followed to a different host, the rules are also cached for
-   * that host.
-   * <p>
-   * NOTE(review): in this copy of the file the logging calls inside the
-   * whitelist branch and inside the catch block are incomplete (their
-   * beginning is missing), presumably stripped when the text was archived --
-   * confirm against the repository.
-   * 
-   * @param http
-   *          The {@link Protocol} object; it is cast to {@link HttpBase}
-   * @param url
-   *          URL robots.txt applies to
-   * 
-   * @return {@link BaseRobotRules} holding the rules from robots.txt
-   */
-  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-    String cacheKey = getCacheKey(url);
-    BaseRobotRules robotRules = CACHE.get(cacheKey);
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
-    }
-    boolean cacheRule = true;
-    // Set only when a redirect was followed; used below to cache the rules
-    // for the redirect target's host as well.
-    URL redir = null;
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-"Whitelisted host found for: {}", url);
-"Ignoring robots.txt for all URLs from whitelisted host: {}",
-          url.getHost());
-    } else {
-      try {
-        Response response = ((HttpBase) http).getResponse(new URL(url,
-            "/robots.txt"), new CrawlDatum(), true);
-        // try one level of redirection ?
-        if (response.getCode() == 301 || response.getCode() == 302) {
-          String redirection = response.getHeader("Location");
-          if (redirection == null) {
-            // some versions of MS IIS are known to mangle this header
-            redirection = response.getHeader("location");
-          }
-          if (redirection != null) {
-            if (!redirection.startsWith("http")) {
-              // RFC says it should be absolute, but apparently it isn't
-              redir = new URL(url, redirection);
-            } else {
-              redir = new URL(redirection);
-            }
-            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
-                true);
-          }
-        }
-        if (response.getCode() == 200) // found rules: parse them
-          robotRules = parseRules(url.toString(), response.getContent(),
-              response.getHeader("Content-Type"), agentNames);
-        else if ((response.getCode() == 403) && (!allowForbidden))
-          robotRules = FORBID_ALL_RULES; // use forbid all
-        else if (response.getCode() >= 500) {
-          cacheRule = false; // try again later to fetch robots.txt
-          robotRules = EMPTY_RULES;
-        } else
-          robotRules = EMPTY_RULES; // use default rules
-      } catch (Throwable t) {
-        if (LOG.isInfoEnabled()) {
-"Couldn't get robots.txt for " + url + ": " + t.toString());
-        }
-        cacheRule = false; // try again later to fetch robots.txt
-        robotRules = EMPTY_RULES;
-      }
-    }
-    if (cacheRule) {
-      CACHE.put(cacheKey, robotRules); // cache rules for host
-      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
-        // cache also for the redirected host
-        CACHE.put(getCacheKey(redir), robotRules);
-      }
-    }
-    return robotRules;
-  }
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
deleted file mode 100644
index 972bb3c..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http},
-{@link org.apache.nutch.protocol.httpclient httpclient})</p>
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/
deleted file mode 100644
index 23e4ef6..0000000
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/
+++ /dev/null
@@ -1,123 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-import org.junit.Assert;
-import org.junit.Test;
-import crawlercommons.robots.BaseRobotRules;
-/**
- * JUnit test case which verifies (1) that robots filtering is performed
- * correctly for the given agent name and (2) that the crawl delay is extracted
- * correctly from the robots file.
- */
-public class TestRobotRulesParser {
-  private static final String CONTENT_TYPE = "text/plain";
-  // Agent name lists referenced by the test messages below.
-  private static final String SINGLE_AGENT = "Agent1";
-  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
-  private static final String UNKNOWN_AGENT = "AgentABC";
-  // The robots.txt fixture uses a bare carriage return as line separator.
-  private static final String CR = "\r";
-  // Fixture with three rule groups: Agent1, Agent2 and the wildcard agent.
-  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
-      + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
-      + CR
-      + "Crawl-delay: 10"
-      + CR // set crawl delay for Agent1 as 10 sec
-      + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
-      + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
-      + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
-  // NOTE(review): every path below is an empty string in this copy of the
-  // file; the values were presumably URLs matching the comments on RESULTS
-  // and were stripped when the text was archived -- confirm against the
-  // repository.
-  private static final String[] TEST_PATHS = new String[] {
-      "", "",
-      "", "",
-      "",
-      "" };
-  // Expected isAllowed() result for the TEST_PATHS entry at the same index.
-  private static final boolean[] RESULTS = new boolean[] { false, // /a
-      false, // /a/bloh/foo.html
-      true, // /b
-      true, // /c
-      false, // /b/a/index.html
-      true // /foo/bar/baz.html
-  };
-  private HttpRobotRulesParser parser;
-  private BaseRobotRules rules;
-  /** Creates the unconfigured parser shared by the tests. */
-  public TestRobotRulesParser() {
-    parser = new HttpRobotRulesParser();
-  }
-  /**
-   * Test that the robots rules are interpreted correctly by the robots rules
-   * parser: every entry of TEST_PATHS must be allowed or disallowed as given
-   * by the RESULTS entry at the same index.
-   * NOTE(review): both parseRules(...) calls below are cut off after their
-   * second argument in this copy of the file; judging by the assertion
-   * messages the first run is for SINGLE_AGENT and the second for
-   * MULTIPLE_AGENTS -- confirm against the repository.
-   */
-  @Test
-  public void testRobotsAgent() {
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agent (" + SINGLE_AGENT + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
-    }
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
-    }
-  }
-  /**
-   * Test that the crawl delay is extracted from the robots file for the
-   * respective agent. If it is not specified for a given agent, the default
-   * value must be returned.
-   * NOTE(review): both parseRules(...) calls below are cut off after their
-   * second argument in this copy of the file -- confirm against the
-   * repository.
-   */
-  @Test
-  public void testCrawlDelay() {
-    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
-    // returned by the parser
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
-    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
-        (rules.getCrawlDelay() == 10000));
-    // for UNKNOWN_AGENT, the default crawl delay must be returned.
-    // The assertion below expects that default to be Long.MIN_VALUE.
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
-    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
-        (rules.getCrawlDelay() == Long.MIN_VALUE));
-  }
diff --git a/src/plugin/lib-nekohtml/build.xml b/src/plugin/lib-nekohtml/build.xml
deleted file mode 100644
index 4bca1af..0000000
--- a/src/plugin/lib-nekohtml/build.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
-<project name="lib-nekohtml" default="jar">
-  <import file="../build-plugin.xml"/>
-  <!--
-   ! Override the compile and jar targets,
-   ! since there is nothing to compile here.
-   ! -->
-  <target name="compile" depends="init, resolve-default"/>
-  <target name="jar" depends="compile"/>
diff --git a/src/plugin/lib-nekohtml/ivy.xml b/src/plugin/lib-nekohtml/ivy.xml
deleted file mode 100644
index ed70b80..0000000
--- a/src/plugin/lib-nekohtml/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url=""/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-  <dependencies>
-    <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
-  </dependencies>
diff --git a/src/plugin/lib-nekohtml/plugin.xml b/src/plugin/lib-nekohtml/plugin.xml
deleted file mode 100644
index 513c9a7..0000000
--- a/src/plugin/lib-nekohtml/plugin.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
- ! NekoHTML is a simple HTML scanner and tag balancer that enables 
- ! application programmers to parse HTML documents and access the 
- ! information using standard XML interfaces.
- ! (
- ! 
- ! License :
- !-->
-   id="lib-nekohtml"
-   name="CyberNeko HTML Parser"
-   version="1.9.19"
-   provider-name="net.sourceforge.nekohtml">
-   <runtime>
-     <library name="nekohtml-1.9.19.jar">
-        <export name="*"/>
-     </library>
-   </runtime>
diff --git a/src/plugin/lib-regex-filter/build.xml b/src/plugin/lib-regex-filter/build.xml
deleted file mode 100644
index 9702ca2..0000000
--- a/src/plugin/lib-regex-filter/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
-<project name="lib-regex-filter" default="jar-core">
-  <import file="../build-plugin.xml"/>
diff --git a/src/plugin/lib-regex-filter/ivy.xml b/src/plugin/lib-regex-filter/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/lib-regex-filter/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url=""/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-  <dependencies>
-  </dependencies>
diff --git a/src/plugin/lib-regex-filter/plugin.xml b/src/plugin/lib-regex-filter/plugin.xml
deleted file mode 100644
index 42de8f1..0000000
--- a/src/plugin/lib-regex-filter/plugin.xml
+++ /dev/null
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
- ! A common framework for RegExp based URL filters
- !-->
-   id="lib-regex-filter"
-   name="Regex URL Filter Framework"
-   version="1.0"
-   provider-name="org.apache.nutch">
-   <runtime>
-     <library name="lib-regex-filter.jar">
-        <export name="*"/>
-     </library>
-   </runtime>
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
deleted file mode 100644
index e408586..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
+++ /dev/null
@@ -1,102 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
- * A generic regular expression rule.
- * 
- * @author J&eacute;r&ocirc;me Charron
- */
-public abstract class RegexRule {
-  private final boolean sign;
-  private final String hostOrDomain;
-  private final String regex;
-  /**
-   * Constructs a new regular expression rule.
-   * 
-   * @param sign
-   *          specifies if this rule must filter-in or filter-out. A
-   *          <code>true</code> value means that any url matching this rule must
-   *          be accepted, a <code>false</code> value means that any url
-   *          matching this rule must be rejected.
-   * @param regex
-   *          is the regular expression used for matching (see
-   *          {@link #match(String)} method).
-   */
-  protected RegexRule(boolean sign, String regex) {
-    this(sign, regex, null);
-  }
-  /**
-   * Constructs a new regular expression rule.
-   * 
-   * @param sign
-   *          specifies if this rule must filter-in or filter-out. A
-   *          <code>true</code> value means that any url matching this rule must
-   *          be accepted, a <code>false</code> value means that any url
-   *          matching this rule must be rejected.
-   * @param regex
-   *          is the regular expression used for matching (see
-   *          {@link #match(String)} method).
-   * @param hostOrDomain
-   *          the host or domain to which this regex belongs
-   */
-  protected RegexRule(boolean sign, String regex, String hostOrDomain) {
-    this.sign = sign;
-    this.hostOrDomain = hostOrDomain;
-    this.regex = regex;
-  }
-  /**
-   * Return if this rule is used for filtering-in or out.
-   * 
-   * @return <code>true</code> if any url matching this rule must be accepted,
-   *         otherwise <code>false</code>.
-   */
-  protected boolean accept() {
-    return sign;
-  }
-  /**
-   * Return the host or domain this rule belongs to.
-   *
-   * @return host or domain this regex rule belongs to
-   */
-  protected String hostOrDomain() { return hostOrDomain; }
-  /**
-   * Return this rule's regex.
-   *
-   * @return this regex
-   */
-  protected String regex() { return regex; }
-  /**
-   * Checks if a url matches this rule.
-   * 
-   * @param url
-   *          is the url to check.
-   * @return <code>true</code> if the specified url matches this rule, otherwise
-   *         <code>false</code>.
-   */
-  protected abstract boolean match(String url);
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
deleted file mode 100644
index f5cc081..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
+++ /dev/null
@@ -1,315 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
-// JDK imports
-import java.util.List;
-import java.util.ArrayList;
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-// Nutch imports
-import org.apache.nutch.util.URLUtil;
- * Generic {@link URLFilter URL filter} based on regular
- * expressions.
- * 
- * <p>
- * The regular expressions rules are expressed in a file. The file of rules is
- * determined for each implementation using the
- * {@link #getRulesReader(Configuration conf)} method.
- * </p>
- * 
- * <p>
- * The format of this file is made of many rules (one per line):<br/>
- * <code>
- * [+-]&lt;regex&gt;
- * </code><br/>
- * where plus (<code>+</code>) means go ahead and index it and minus (
- * <code>-</code>) means no.
- * </p>
- * 
- * @author J&eacute;r&ocirc;me Charron
- */
-public abstract class RegexURLFilterBase implements URLFilter {
-  /** My logger */
-  private final static Logger LOG = LoggerFactory
-      .getLogger(RegexURLFilterBase.class);
-  /** The list of applicable rules */
-  private List<RegexRule> rules;
-  /** The current configuration */
-  private Configuration conf;
-  /**
-   * Constructs a new empty RegexURLFilterBase
-   */
-  public RegexURLFilterBase() {
-  }
-  /**
-   * Constructs a new RegexURLFilter and init it with a file of rules.
-   * 
-   * @param filename
-   *          is the name of rules file.
-   */
-  public RegexURLFilterBase(File filename) throws IOException,
-      IllegalArgumentException {
-    this(new FileReader(filename));
-  }
-  /**
-   * Constructs a new RegexURLFilter and inits it with a list of rules.
-   * 
-   * @param rules
-   *          string with a list of rules, one rule per line
-   * @throws IOException
-   * @throws IllegalArgumentException
-   */
-  public RegexURLFilterBase(String rules) throws IOException,
-      IllegalArgumentException {
-    this(new StringReader(rules));
-  }
-  /**
-   * Constructs a new RegexURLFilter and init it with a Reader of rules.
-   * 
-   * @param reader
-   *          is a reader of rules.
-   */
-  protected RegexURLFilterBase(Reader reader) throws IOException,
-      IllegalArgumentException {
-    rules = readRules(reader);
-  }
-  /**
-   * Creates a new {@link RegexRule}.
-   * 
-   * @param sign
-   *          of the regular expression. A <code>true</code> value means that
-   *          any URL matching this rule must be included, whereas a
-   *          <code>false</code> value means that any URL matching this rule
-   *          must be excluded.
-   * @param regex
-   *          is the regular expression associated to this rule.
-   */
-  protected abstract RegexRule createRule(boolean sign, String regex);
-  /**
-   * Creates a new {@link RegexRule}.
-   * @param sign
-   *        of the regular expression.
-   *        A <code>true</code> value means that any URL matching this rule
-   *        must be included, whereas a <code>false</code>
-   *        value means that any URL matching this rule must be excluded.
-   * @param regex
-   *        is the regular expression associated to this rule.
-   * @param hostOrDomain
-   *        the host or domain to which this regex belongs
-   */
-  protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
-  /**
-   * Returns a reader over the rules to use for a particular
-   * implementation.
-   * 
-   * @param conf
-   *          is the current configuration.
-   * @return a reader over the resource containing the rules to use.
-   */
-  protected abstract Reader getRulesReader(Configuration conf)
-      throws IOException;
-  /*
-   * -------------------------- * <implementation:URLFilter> *
-   * --------------------------
-   */
-  // Inherited Javadoc
-  public String filter(String url) {
-    String host = URLUtil.getHost(url);
-    String domain = null;
-    try {
-      domain = URLUtil.getDomainName(url);
-    } catch (MalformedURLException e) {
-      // shouldnt happen here right?
-    }
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("URL belongs to host " + host + " and domain " + domain);
-    }
-    for (RegexRule rule : rules) {
-      // Skip rules bound to a host or domain that matches neither this URL's host nor its domain
-      if (rule.hostOrDomain() != null &&
-            !rule.hostOrDomain().equals(host) &&
-            !rule.hostOrDomain().equals(domain)) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
-        }
-        continue;
-      }
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
-      }
-      if (rule.match(url)) {
-        return rule.accept() ? url : null;
-      }
-    }
-    ;
-    return null;
-  }
-  /*
-   * --------------------------- * </implementation:URLFilter> *
-   * ---------------------------
-   */
-  /*
-   * ----------------------------- * <implementation:Configurable> *
-   * -----------------------------
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    Reader reader = null;
-    try {
-      reader = getRulesReader(conf);
-    } catch (Exception e) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error(e.getMessage());
-      }
-      throw new RuntimeException(e.getMessage(), e);
-    }
-    try {
-      rules = readRules(reader);
-    } catch (IOException e) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error(e.getMessage());
-      }
-      throw new RuntimeException(e.getMessage(), e);
-    }
-  }
-  public Configuration getConf() {
-    return this.conf;
-  }
-  /*
-   * ------------------------------ * </implementation:Configurable> *
-   * ------------------------------
-   */
-  /**
-   * Read the specified file of rules.
-   * 
-   * @param reader
-   *          is a reader of regular expressions rules.
-   * @return the corresponding {@link RegexRule rules}.
-   */
-  private List<RegexRule> readRules(Reader reader) throws IOException,
-      IllegalArgumentException {
-    BufferedReader in = new BufferedReader(reader);
-    List<RegexRule> rules = new ArrayList<RegexRule>();
-    String line;
-    String hostOrDomain = null;
-    while ((line = in.readLine()) != null) {
-      if (line.length() == 0) {
-        continue;
-      }
-      char first = line.charAt(0);
-      boolean sign = false;
-      switch (first) {
-      case '+':
-        sign = true;
-        break;
-      case '-':
-        sign = false;
-        break;
-      case ' ':
-      case '\n':
-      case '#': // skip blank & comment lines
-        continue;
-      case '>':
-        hostOrDomain = line.substring(1).trim();
-        continue;
-      case '<':
-        hostOrDomain = null;
-        continue;
-      default:
-        throw new IOException("Invalid first character: " + line);
-      }
-      String regex = line.substring(1);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
-      }
-      RegexRule rule = createRule(sign, regex, hostOrDomain);
-      rules.add(rule);
-    }
-    return rules;
-  }
-  /**
-   * Filter the standard input using a RegexURLFilterBase.
-   * 
-   * @param filter
-   *          is the RegexURLFilterBase to use for filtering the standard input.
-   * @param args
-   *          some optional parameters (not used).
-   */
-  public static void main(RegexURLFilterBase filter, String args[])
-      throws IOException, IllegalArgumentException {
-    BufferedReader in = new BufferedReader(new InputStreamReader(;
-    String line;
-    while ((line = in.readLine()) != null) {
-      String out = filter.filter(line);
-      if (out != null) {
-        System.out.print("+");
-        System.out.println(out);
-      } else {
-        System.out.print("-");
-        System.out.println(line);
-      }
-    }
-  }
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
deleted file mode 100644
index b849353..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
+++ /dev/null
@@ -1,23 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- * Generic {@link URL filter} library,
- * abstracting away from regular expression implementations.
- */
-package org.apache.nutch.urlfilter.api;
diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/
deleted file mode 100644
index 0b58231..0000000
--- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/
+++ /dev/null
@@ -1,134 +0,0 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
-// JDK imports
-import java.util.ArrayList;
-import java.util.List;
-import org.junit.Assert;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-// Nutch imports
- * JUnit based test of class <code>RegexURLFilterBase</code>.
- * 
- * @author J&eacute;r&ocirc;me Charron
- */
-public abstract class RegexURLFilterBaseTest {
-  /** My logger */
-  protected static final Logger LOG = LoggerFactory
-      .getLogger(RegexURLFilterBaseTest.class);
-  private final static String SEPARATOR = System.getProperty("file.separator");
-  private final static String SAMPLES = System.getProperty("", ".");
-  protected abstract URLFilter getURLFilter(Reader rules);
-  protected void bench(int loops, String file) {
-    try {
-      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
-    } catch (Exception e) {
-    }
-  }
-  protected void bench(int loops, Reader rules, Reader urls) {
-    long start = System.currentTimeMillis();
-    try {
-      URLFilter filter = getURLFilter(rules);
-      FilteredURL[] expected = readURLFile(urls);
-      for (int i = 0; i < loops; i++) {
-        test(filter, expected);
-      }
-    } catch (Exception e) {
-    }
-"bench time (" + loops + ") "
-        + (System.currentTimeMillis() - start) + "ms");
-  }
-  protected void test(String file) {
-    try {
-      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
-    } catch (Exception e) {
-    }
-  }
-  protected void test(Reader rules, Reader urls) {
-    try {
-      test(getURLFilter(rules), readURLFile(urls));
-    } catch (Exception e) {
-    }
-  }
-  protected void test(URLFilter filter, FilteredURL[] expected) {
-    for (int i = 0; i < expected.length; i++) {
-      String result = filter.filter(expected[i].url);
-      if (result != null) {
-        Assert.assertTrue(expected[i].url, expected[i].sign);
-      } else {
-        Assert.assertFalse(expected[i].url, expected[i].sign);
-      }
-    }
-  }
-  private static FilteredURL[] readURLFile(Reader reader) throws IOException {
-    BufferedReader in = new BufferedReader(reader);
-    List<FilteredURL> list = new ArrayList<FilteredURL>();
-    String line;
-    while ((line = in.readLine()) != null) {
-      if (line.length() != 0) {
-        list.add(new FilteredURL(line));
-      }
-    }
-    return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
-  }
-  private static class FilteredURL {
-    boolean sign;
-    String url;
-    FilteredURL(String line) {
-      switch (line.charAt(0)) {
-      case '+':
-        sign = true;
-        break;
-      case '-':
-        sign = false;
-        break;
-      default:
-        // Simply ignore...
-      }
-      url = line.substring(1);
-    }
-  }
diff --git a/src/plugin/lib-selenium/build-ivy.xml b/src/plugin/lib-selenium/build-ivy.xml
deleted file mode 100644
index 3abcf6d..0000000
--- a/src/plugin/lib-selenium/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
-<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-    <target name="download-ivy" unless="offline">
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="${ivy.install.version}/ivy-${ivy.install.version}.jar" 
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
-  </target>
diff --git a/src/plugin/lib-selenium/build.xml b/src/plugin/lib-selenium/build.xml
deleted file mode 100644
index 7c6d98d..0000000
--- a/src/plugin/lib-selenium/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- See the License for the specific language governing permissions and
- limitations under the License.
-<project name="lib-selenium" default="jar-core">
-  <import file="../build-plugin.xml"/>
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">    
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-http/*.jar" />
-    </fileset>
-  </path>
diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt
deleted file mode 100644
index 1892a62..0000000
--- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
-2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
-   To get a list of dependencies and their versions execute:
-    $ ant -f ./build-ivy.xml
-    $ ls lib | sed 's/^/     <library name="/g' | sed 's/$/">\n       <export name="*"\/>\n     <\/library>/g'
-   Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
-   N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows
-   $ brew install gnu-sed --with-default-names
-   You can then restart your terminal and the Regex + Sed command should work just fine!
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
deleted file mode 100644
index 701b725..0000000
--- a/src/plugin/lib-selenium/ivy.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<?xml version="1.0" ?>
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url=""/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-  <dependencies>
-    <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
-    <dependency org="com.opera" name="operadriver" rev="1.5">
-      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
-    </dependency>
-    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
-      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
-      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
-    </dependency>
-    <!-- end selenium dependencies -->
-  </dependencies>
diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml
deleted file mode 100644
index a86d665..0000000
--- a/src/plugin/lib-selenium/plugin.xml
+++ /dev/null
@@ -1,175 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! A common framework for http protocol implementations
- !-->
-<plugin
-   id="lib-selenium"
-   name="HTTP Framework"
-   version="1.0"
-   provider-name="org.apache.nutch">
-   <runtime>
-     <library name="lib-selenium.jar">
-        <export name="*"/>
-     </library>
-     <!-- all classes from dependent libraries are exported -->
-     <library name="cglib-nodep-2.1_3.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-codec-1.10.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-collections-3.2.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-exec-1.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-io-2.4.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-jxpath-1.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-lang3-3.4.jar">
-       <export name="*"/>
-     </library>
-     <library name="commons-logging-1.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="cssparser-0.9.16.jar">
-       <export name="*"/>
-     </library>
-     <library name="gson-2.3.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="guava-18.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="htmlunit-2.18.jar">
-       <export name="*"/>
-     </library>
-     <library name="htmlunit-core-js-2.17.jar">
-       <export name="*"/>
-     </library>
-     <library name="httpclient-4.5.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="httpcore-4.4.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="httpmime-4.5.jar">
-       <export name="*"/>
-     </library>
-     <library name="ini4j-0.5.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="jetty-io-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="jetty-util-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="jna-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="jna-platform-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="nekohtml-1.9.22.jar">
-       <export name="*"/>
-     </library>
-     <library name="netty-3.5.2.Final.jar">
-       <export name="*"/>
-     </library>
-     <library name="operadriver-1.5.jar">
-       <export name="*"/>
-     </library>
-     <library name="operalaunchers-1.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="phantomjsdriver-1.2.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="protobuf-java-2.4.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="sac-1.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-api-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-chrome-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-edge-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-firefox-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-htmlunit-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-ie-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-java-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-leg-rc-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-remote-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-safari-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-support-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="serializer-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="webbit-0.4.14.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-api-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-client-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-common-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="xalan-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="xercesImpl-2.11.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="xml-apis-1.4.01.jar">
-       <export name="*"/>
-     </library>
-   </runtime>