You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:22 UTC

[06/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
new file mode 100644
index 0000000..afcf24a
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Based on EasySSLProtocolSocketFactory from commons-httpclient:
+ * 
+ * $Header:
+ * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
+ * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
+ * -0800 (Sat, 26 Feb 2005) $
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.Socket;
+import java.net.UnknownHostException;
+
+import org.apache.commons.httpclient.ConnectTimeoutException;
+import org.apache.commons.httpclient.HttpClientError;
+import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
+import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+
+/**
+ * Socket factory whose SSL context trusts every certificate through
+ * {@link DummyX509TrustManager}; it performs no certificate validation.
+ */
+public class DummySSLProtocolSocketFactory implements
+    SecureProtocolSocketFactory {
+
+  /** Logger object for this class. */
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DummySSLProtocolSocketFactory.class);
+
+  private SSLContext sslcontext = null;
+
+  /** Constructor for DummySSLProtocolSocketFactory. */
+  public DummySSLProtocolSocketFactory() {
+    super();
+  }
+
+  /**
+   * Creates an SSL context whose only trust manager is a DummyX509TrustManager.
+   * Any failure is logged and rethrown as an HttpClientError caused by it.
+   */
+  private static SSLContext createEasySSLContext() {
+    try {
+      SSLContext context = SSLContext.getInstance("SSL");
+      context.init(null,
+          new TrustManager[] { new DummyX509TrustManager(null) }, null);
+      return context;
+    } catch (Exception e) {
+      LOG.error(e.getMessage(), e);
+      // HttpClientError has no cause-taking constructor, so chain it here.
+      throw (HttpClientError) new HttpClientError(e.toString()).initCause(e);
+    }
+  }
+
+  /** Returns the cached SSL context, creating it on the first call. */
+  private SSLContext getSSLContext() {
+    if (this.sslcontext == null) {
+      this.sslcontext = createEasySSLContext();
+    }
+    return this.sslcontext;
+  }
+
+  /** @see SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int) */
+  public Socket createSocket(String host, int port, InetAddress clientHost,
+      int clientPort) throws IOException, UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(host, port,
+        clientHost, clientPort);
+  }
+
+  /**
+   * Attempts to get a new socket connection to the given host within the given
+   * time limit.
+   * <p>
+   * To circumvent the limitations of older JREs that do not support connect
+   * timeout a controller thread is executed. The controller thread attempts to
+   * create a new socket within the given limit of time. If socket constructor
+   * does not return until the timeout expires, the controller terminates and
+   * throws a {@link ConnectTimeoutException}
+   * </p>
+   * @param host
+   *          the host name/IP
+   * @param port
+   *          the port on the host
+   * @param localAddress
+   *          the local host name/IP to bind the socket to
+   * @param localPort
+   *          the port on the local machine
+   * @param params
+   *          {@link HttpConnectionParams Http connection parameters}
+   * @return Socket a new socket
+   * @throws IOException
+   *           if an I/O error occurs while creating the socket
+   * @throws UnknownHostException
+   *           if the IP address of the host cannot be determined
+   * @throws IllegalArgumentException
+   *           if <code>params</code> is null
+   */
+  public Socket createSocket(final String host, final int port,
+      final InetAddress localAddress, final int localPort,
+      final HttpConnectionParams params) throws IOException,
+      UnknownHostException, ConnectTimeoutException {
+    if (params == null) {
+      throw new IllegalArgumentException("Parameters may not be null");
+    }
+    int timeout = params.getConnectionTimeout();
+    if (timeout == 0) {
+      return createSocket(host, port, localAddress, localPort);
+    } else {
+      // To be eventually deprecated when migrated to Java 1.4 or above
+      return ControllerThreadSocketFactory.createSocket(this, host, port,
+          localAddress, localPort, timeout);
+    }
+  }
+
+  /** @see SecureProtocolSocketFactory#createSocket(String,int) */
+  public Socket createSocket(String host, int port) throws IOException,
+      UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(host, port);
+  }
+
+  /** @see SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean) */
+  public Socket createSocket(Socket socket, String host, int port,
+      boolean autoClose) throws IOException, UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(socket, host, port,
+        autoClose);
+  }
+
+  /** Any two objects of exactly this class are considered equal. */
+  @Override
+  public boolean equals(Object obj) {
+    return ((obj != null) && obj.getClass().equals(
+        DummySSLProtocolSocketFactory.class));
+  }
+
+  @Override
+  public int hashCode() {
+    return DummySSLProtocolSocketFactory.class.hashCode();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
new file mode 100644
index 0000000..b5509cc
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Based on EasyX509TrustManager from commons-httpclient.
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Trust manager that accepts every client and server certificate chain
+ * without validation; only {@link #getAcceptedIssuers()} consults the
+ * default trust manager obtained in the constructor.
+ */
+public class DummyX509TrustManager implements X509TrustManager {
+  private X509TrustManager standardTrustManager = null;
+
+  /** Logger object for this class. */
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DummyX509TrustManager.class);
+
+  /**
+   * Constructor for DummyX509TrustManager.
+   */
+  public DummyX509TrustManager(KeyStore keystore)
+      throws NoSuchAlgorithmException, KeyStoreException {
+    super();
+    String algo = TrustManagerFactory.getDefaultAlgorithm();
+    TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+    factory.init(keystore);
+    TrustManager[] trustmanagers = factory.getTrustManagers();
+    if (trustmanagers.length == 0) {
+      throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+    }
+    this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+  }
+
+  /** Always returns true; not declared by {@link X509TrustManager}. */
+  public boolean isClientTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /** Always returns true; not declared by {@link X509TrustManager}. */
+  public boolean isServerTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /** @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() */
+  @Override
+  public X509Certificate[] getAcceptedIssuers() {
+    return this.standardTrustManager.getAcceptedIssuers();
+  }
+
+  /** Accepts any client certificate chain; never throws. */
+  @Override
+  public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // do nothing
+  }
+
+  /** Accepts any server certificate chain; never throws. */
+  @Override
+  public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // do nothing
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
new file mode 100644
index 0000000..75506ce
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.io.InputStream;
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Node;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// HTTP Client imports
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.NTCredentials;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+import org.apache.commons.httpclient.protocol.Protocol;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
+// NUTCH-1929 Consider implementing dependency injection for crawl HTTPS sites that use self signed certificates
+//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
+
+import org.apache.commons.lang.StringUtils;
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * <p>
+ * This class is a protocol plugin that configures an HTTP client for Basic,
+ * Digest and NTLM authentication schemes for web server as well as proxy
+ * server. It takes care of HTTPS protocol as well as cookies in a single fetch
+ * session.
+ * </p>
+ * <p>
+ * Documentation can be found on the Nutch <a
+ * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
+ * >HttpAuthenticationSchemes</a> wiki page.
+ * </p>
+ * <p>
+ * The original description of the motivation to support <a
+ * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
+ * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
+ * HttpPostAuthentication development is documented at the <a
+ * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
+ * issue.
+ * </p>
+ * @author Susam Pal
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
+
+  // Since the Configuration has not yet been set,
+  // then an unconfigured client is returned.
+  private static HttpClient client = new HttpClient(connectionManager);
+  private static String defaultUsername;
+  private static String defaultPassword;
+  private static String defaultRealm;
+  private static String defaultScheme;
+  private static String authFile;
+  private static String agentHost;
+  private static boolean authRulesRead = false;
+  private static Configuration conf;
+
+  private int maxThreadsTotal = 10;
+
+  private String proxyUsername;
+  private String proxyPassword;
+  private String proxyRealm;
+
+  private static HttpFormAuthConfigurer formConfigurer;
+
+  /**
+   * Returns the static HTTP client shared by all instances of this plugin.
+   * 
+   * @return HTTP client
+   */
+  static synchronized HttpClient getClient() {
+    return client;
+  }
+
+  /**
+   * Constructs this plugin, passing this class's logger to the base class.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Reads the configuration from the Nutch configuration files and sets the
+   * configuration. The configuration, agent host and auth file are kept in
+   * static fields; the client is then configured and credentials are read.
+   * 
+   * @param conf
+   *          Configuration
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    Http.conf = conf;
+    this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
+    this.proxyUsername = conf.get("http.proxy.username", "");
+    this.proxyPassword = conf.get("http.proxy.password", "");
+    this.proxyRealm = conf.get("http.proxy.realm", "");
+    agentHost = conf.get("http.agent.host", "");
+    authFile = conf.get("http.auth.file", "");
+    configureClient();
+    try {
+      setCredentials();
+    } catch (Exception ex) {
+      // Pass the exception itself so the stack trace is not lost.
+      LOG.error("Could not read " + authFile + " : " + ex.getMessage(), ex);
+    }
+  }
+
+  /**
+   * Main method: configures a new instance and hands it to the inherited
+   * <code>main(http, args)</code>.
+   * @param args
+   *          Command line arguments
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /**
+   * Resolves the credentials for the <code>url</code>, then creates the
+   * {@link HttpResponse} for it.
+   * 
+   * @param url
+   *          URL to be fetched
+   * @param datum
+   *          Crawl data
+   * @param redirect
+   *          Follow redirects if and only if true
+   * @return HTTP response
+   */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    resolveCredentials(url);
+    return new HttpResponse(this, url, datum, redirect);
+  }
+
+  /**
+   * Configures the shared client: HTTPS factory, timeouts, headers and proxy.
+   */
+  private void configureClient() {
+
+    // Set up an HTTPS socket factory that accepts self-signed certs.
+    // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+    ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
+    Protocol https = new Protocol("https", factory, 443);
+    Protocol.registerProtocol("https", https);
+
+    HttpConnectionManagerParams params = connectionManager.getParams();
+    params.setConnectionTimeout(timeout);
+    params.setSoTimeout(timeout);
+    params.setSendBufferSize(BUFFER_SIZE);
+    params.setReceiveBufferSize(BUFFER_SIZE);
+
+    // --------------------------------------------------------------------------------
+    // NUTCH-1836: Modification to increase the number of available connections
+    // for multi-threaded crawls.
+    // --------------------------------------------------------------------------------
+    params.setMaxTotalConnections(conf.getInt(
+        "mapred.tasktracker.map.tasks.maximum", 5)
+        * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
+
+    // Also set max connections per host to the fetcher thread count since all
+    // threads might be used to fetch from the same host - otherwise timeout
+    // errors can occur
+    params.setDefaultMaxConnectionsPerHost(conf.getInt(
+        "fetcher.threads.fetch", maxThreadsTotal));
+
+    // executeMethod(HttpMethod) seems to ignore the connection timeout on the
+    // connection manager.
+    // set it explicitly on the HttpClient.
+    client.getParams().setConnectionManagerTimeout(timeout);
+
+    HostConfiguration hostConf = client.getHostConfiguration();
+    ArrayList<Header> headers = new ArrayList<Header>();
+    // Set the User Agent in the header
+    // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
+    // prefer English
+    headers.add(new Header("Accept-Language", acceptLanguage));
+    // prefer UTF-8
+    headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
+    // prefer understandable formats
+    headers
+        .add(new Header(
+            "Accept",
+            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    // accept gzipped content
+    headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
+    hostConf.getParams().setParameter("http.default-headers", headers);
+
+    // HTTP proxy server details
+    if (useProxy) {
+      hostConf.setProxy(proxyHost, proxyPort);
+
+      if (proxyUsername.length() > 0) {
+
+        AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
+            this.proxyRealm);
+
+        NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
+            this.proxyPassword, Http.agentHost, this.proxyRealm);
+
+        client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
+      }
+    }
+
+  }
+
+  /**
+   * Reads authentication configuration file (defined as 'http.auth.file' in
+   * Nutch configuration file) and sets the credentials for the configured
+   * authentication scopes in the HTTP client object. The file is read at most
+   * once; later calls return immediately. The stream is always closed.
+   * 
+   * @throws ParserConfigurationException
+   *           If a document builder can not be created.
+   * @throws SAXException
+   *           If any parsing error occurs.
+   * @throws IOException
+   *           If any I/O error occurs.
+   */
+  private static synchronized void setCredentials()
+      throws ParserConfigurationException, SAXException, IOException {
+    if (authRulesRead)
+      return;
+
+    authRulesRead = true; // Avoid re-attempting to read
+
+    InputStream is = conf.getConfResourceAsInputStream(authFile);
+    if (is == null)
+      return;
+    try {
+      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+          .parse(is);
+
+      Element rootElement = doc.getDocumentElement();
+      if (!"auth-configuration".equals(rootElement.getTagName())) {
+        LOG.warn("Bad auth conf file: root element <"
+            + rootElement.getTagName() + "> found in " + authFile
+            + " - must be <auth-configuration>");
+      }
+
+      // For each set of credentials
+      NodeList credList = rootElement.getChildNodes();
+      for (int i = 0; i < credList.getLength(); i++) {
+        Node credNode = credList.item(i);
+        if (!(credNode instanceof Element))
+          continue;
+
+        Element credElement = (Element) credNode;
+        if (!"credentials".equals(credElement.getTagName())) {
+          LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
+              + "> not recognized in " + authFile
+              + " - expected <credentials>");
+          continue;
+        }
+
+        String authMethod = credElement.getAttribute("authMethod");
+        // read http form post auth info
+        if (StringUtils.isNotBlank(authMethod)) {
+          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
+          continue;
+        }
+
+        String username = credElement.getAttribute("username");
+        String password = credElement.getAttribute("password");
+
+        // For each authentication scope
+        NodeList scopeList = credElement.getChildNodes();
+        for (int j = 0; j < scopeList.getLength(); j++) {
+          Node scopeNode = scopeList.item(j);
+          if (!(scopeNode instanceof Element))
+            continue;
+
+          Element scopeElement = (Element) scopeNode;
+
+          if ("default".equals(scopeElement.getTagName())) {
+            // Determine realm and scheme, if any
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set default credentials
+            defaultUsername = username;
+            defaultPassword = password;
+            defaultRealm = realm;
+            defaultScheme = scheme;
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username
+                  + "; set as default" + " for realm: " + realm + "; scheme: "
+                  + scheme);
+            }
+
+          } else if ("authscope".equals(scopeElement.getTagName())) {
+
+            // Determine authentication scope details
+            String host = scopeElement.getAttribute("host");
+            int port = -1; // For setting port to AuthScope.ANY_PORT
+            try {
+              port = Integer.parseInt(scopeElement.getAttribute("port"));
+            } catch (NumberFormatException ignored) {
+              // do nothing, port is already set to any port
+            }
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set credentials for the determined scope
+            AuthScope authScope = getAuthScope(host, port, realm, scheme);
+            NTCredentials credentials = new NTCredentials(username, password,
+                agentHost, realm);
+
+            client.getState().setCredentials(authScope, credentials);
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username
+                  + "; set for AuthScope - " + "host: " + host + "; port: "
+                  + port + "; realm: " + realm + "; scheme: " + scheme);
+            }
+
+          } else {
+            LOG.warn("Bad auth conf file: Element <"
+                + scopeElement.getTagName() + "> not recognized in "
+                + authFile + " - expected <authscope>");
+          }
+        }
+      }
+    } finally {
+      // Close on every path; it was previously closed only inside the loop.
+      is.close();
+    }
+  }
+
+  /**
+   * Builds the form authentication settings from a
+   * <code>&lt;credentials&gt;</code> element such as:
+   * <pre>
+   * &lt;auth-configuration&gt;
+   *   &lt;credentials authMethod="formAuth" loginUrl="loginUrl"
+   *       loginFormId="loginFormId" loginRedirect="true"&gt;
+   *     &lt;loginPostData&gt;
+   *       &lt;field name="username" value="user1"/&gt;
+   *     &lt;/loginPostData&gt;
+   *     &lt;additionalPostHeaders&gt;
+   *       &lt;field name="header1" value="value1"/&gt;
+   *     &lt;/additionalPostHeaders&gt;
+   *     &lt;removedFormFields&gt;
+   *       &lt;field name="header1"/&gt;
+   *     &lt;/removedFormFields&gt;
+   *   &lt;/credentials&gt;
+   * &lt;/auth-configuration&gt;
+   * </pre>
+   * 
+   * @param credElement
+   *          the <code>&lt;credentials&gt;</code> element to read
+   * @param authMethod
+   *          value of its <code>authMethod</code> attribute
+   * @return the populated configurer
+   * @throws IllegalArgumentException
+   *           if <code>authMethod</code> is not "formAuth", or if
+   *           <code>loginUrl</code> or <code>loginFormId</code> is blank
+   */
+  private static HttpFormAuthConfigurer readFormAuthConfigurer(
+      Element credElement, String authMethod) {
+    if (!"formAuth".equals(authMethod)) {
+      throw new IllegalArgumentException("Unsupported authMethod: "
+          + authMethod);
+    }
+    HttpFormAuthConfigurer configurer = new HttpFormAuthConfigurer();
+    String str = credElement.getAttribute("loginUrl");
+    if (StringUtils.isNotBlank(str)) {
+      configurer.setLoginUrl(str.trim());
+    } else {
+      throw new IllegalArgumentException("Must set loginUrl.");
+    }
+    str = credElement.getAttribute("loginFormId");
+    if (StringUtils.isNotBlank(str)) {
+      configurer.setLoginFormId(str.trim());
+    } else {
+      throw new IllegalArgumentException("Must set loginFormId.");
+    }
+    str = credElement.getAttribute("loginRedirect");
+    if (StringUtils.isNotBlank(str)) {
+      configurer.setLoginRedirect(Boolean.parseBoolean(str));
+    }
+    NodeList nodeList = credElement.getChildNodes();
+    for (int j = 0; j < nodeList.getLength(); j++) {
+      Node node = nodeList.item(j);
+      if (!(node instanceof Element))
+        continue;
+
+      Element element = (Element) node;
+      if ("loginPostData".equals(element.getTagName())) {
+        configurer.setLoginPostData(readFields(element));
+      } else if ("additionalPostHeaders".equals(element.getTagName())) {
+        configurer.setAdditionalPostHeaders(readFields(element));
+      } else if ("removedFormFields".equals(element.getTagName())) {
+        Set<String> removed = new HashSet<String>(readFields(element).keySet());
+        configurer.setRemovedFormFields(removed);
+      }
+    }
+    return configurer;
+  }
+
+  /**
+   * Collects the <code>name</code> and <code>value</code> attributes of every
+   * child element of <code>parent</code>; a later duplicate name wins.
+   */
+  private static Map<String, String> readFields(Element parent) {
+    Map<String, String> fields = new HashMap<String, String>();
+    NodeList children = parent.getChildNodes();
+    for (int k = 0; k < children.getLength(); k++) {
+      Node child = children.item(k);
+      if (child instanceof Element) {
+        Element field = (Element) child;
+        fields.put(field.getAttribute("name"), field.getAttribute("value"));
+      }
+    }
+    return fields;
+  }
+
+  /**
+   * When a form-auth configuration is present, performs the form login and
+   * returns. Otherwise, if default credentials are configured and none are
+   * set in the HTTP client for the host and port of <code>url</code>, sets
+   * the default credentials for that scope; if credentials are already set,
+   * the client is left unaltered.
+   * 
+   * @param url
+   *          URL to be fetched
+   */
+  private void resolveCredentials(URL url) {
+
+    if (formConfigurer != null) {
+      HttpFormAuthentication formAuther = new HttpFormAuthentication(
+          formConfigurer, client, this);
+      try {
+        formAuther.login();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+
+      return;
+    }
+
+    if (defaultUsername != null && defaultUsername.length() > 0) {
+
+      int port = url.getPort();
+      if (port == -1) {
+        if ("https".equals(url.getProtocol()))
+          port = 443;
+        else
+          port = 80;
+      }
+
+      AuthScope scope = new AuthScope(url.getHost(), port);
+
+      if (client.getState().getCredentials(scope) != null) {
+        if (LOG.isTraceEnabled())
+          LOG.trace("Pre-configured credentials with scope - host: "
+              + url.getHost() + "; port: " + port + "; found for url: " + url);
+
+        // Credentials are already configured, so do nothing and return
+        return;
+      }
+
+      if (LOG.isTraceEnabled())
+        LOG.trace("Pre-configured credentials with scope -  host: "
+            + url.getHost() + "; port: " + port + "; not found for url: " + url);
+
+      AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
+          defaultRealm, defaultScheme);
+
+      NTCredentials serverCredentials = new NTCredentials(defaultUsername,
+          defaultPassword, agentHost, defaultRealm);
+
+      client.getState().setCredentials(serverAuthScope, serverCredentials);
+    }
+  }
+
+  /**
+   * Returns an authentication scope for the specified <code>host</code>,
+   * <code>port</code>, <code>realm</code> and <code>scheme</code>.
+   * 
+   * @param host
+   *          Host name or address.
+   * @param port
+   *          Port number.
+   * @param realm
+   *          Authentication realm.
+   * @param scheme
+   *          Authentication scheme.
+   */
+  private static AuthScope getAuthScope(String host, int port, String realm,
+      String scheme) {
+
+    if (host.length() == 0)
+      host = null;
+
+    if (port < 0)
+      port = -1;
+
+    if (realm.length() == 0)
+      realm = null;
+
+    if (scheme.length() == 0)
+      scheme = null;
+
+    return new AuthScope(host, port, realm, scheme);
+  }
+
+  /**
+   * Returns an authentication scope for the specified <code>host</code>,
+   * <code>port</code> and <code>realm</code>, with an empty scheme.
+   * 
+   * @param host
+   *          Host name or address.
+   * @param port
+   *          Port number.
+   * @param realm
+   *          Authentication realm.
+   */
+  private static AuthScope getAuthScope(String host, int port, String realm) {
+
+    return getAuthScope(host, port, realm, "");
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
new file mode 100644
index 0000000..54dc905
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.List;
+
+/**
+ * The base level of services required for Http Authentication
+ * 
+ * @see HttpAuthenticationFactory
+ * 
+ * @author Matt Tencati
+ */
+public interface HttpAuthentication {
+
+  /**
+   * Returns the credentials generated by this HttpAuthentication object.
+   * 
+   * @return The credentials value; may be <code>null</code>
+   */
+  List<String> getCredentials();
+
+  /**
+   * Returns the realm that was used when this HttpAuthentication object was
+   * created.
+   * 
+   * @return The realm value
+   */
+  String getRealm();
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
new file mode 100644
index 0000000..daff5ec
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+/**
+ * Can be used to identify problems during creation of Authentication objects.
+ * In the future it may be used as a method of collecting authentication
+ * failures during Http protocol transfer in order to present the user with
+ * credentials required during a future fetch.
+ * 
+ * @author Matt Tencati
+ */
+public class HttpAuthenticationException extends Exception {
+
+  /** Explicit serialization version; {@link Exception} is serializable. */
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Constructs a new exception with null as its detail message.
+   */
+  public HttpAuthenticationException() {
+    super();
+  }
+
+  /**
+   * Constructs a new exception with the specified detail message.
+   * 
+   * @param message
+   *          the detail message. The detail message is saved for later
+   *          retrieval by the {@link Throwable#getMessage()} method.
+   */
+  public HttpAuthenticationException(String message) {
+    super(message);
+  }
+
+  /**
+   * Constructs a new exception with the specified message and cause.
+   * 
+   * @param message
+   *          the detail message. The detail message is saved for later
+   *          retrieval by the {@link Throwable#getMessage()} method.
+   * @param cause
+   *          the cause (use {@link #getCause()} to retrieve the cause)
+   */
+  public HttpAuthenticationException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Constructs a new exception with the specified cause. The detail message is
+   * taken from the given cause if it is not null.
+   * 
+   * @param cause
+   *          the cause (use {@link #getCause()} to retrieve the cause)
+   */
+  public HttpAuthenticationException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
new file mode 100644
index 0000000..064a6d0
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collection;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Provides the Http protocol implementation with the ability to authenticate
+ * when prompted. The goal is to provide multiple authentication types but for
+ * now just the {@link HttpBasicAuthentication} authentication type is provided.
+ * 
+ * @see HttpBasicAuthentication
+ * @see Http
+ * @see HttpResponse
+ * 
+ * @author Matt Tencati
+ */
+public class HttpAuthenticationFactory implements Configurable {
+
+  /**
+   * The HTTP Authentication (WWW-Authenticate) header which is returned by a
+   * webserver requiring authentication.
+   */
+  public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpAuthenticationFactory.class);
+
+  private Configuration conf = null;
+
+  /**
+   * Creates a factory that hands the given configuration to the
+   * authentication objects it looks up.
+   * 
+   * @param conf
+   *          the configuration to use
+   */
+  public HttpAuthenticationFactory(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Looks for an authentication object that can answer the WWW-Authenticate
+   * challenge found in the given headers.
+   * 
+   * @param header
+   *          the response headers; may be <code>null</code>
+   * @return a matching authentication object, or <code>null</code> if the
+   *         headers are <code>null</code>, carry no challenge, no
+   *         authentication matches the challenge, or an error occurs (errors
+   *         are logged)
+   */
+  public HttpAuthentication findAuthentication(Metadata header) {
+
+    if (header == null)
+      return null;
+
+    try {
+      Collection<String> challenge = new ArrayList<String>();
+      String headerValue = header.get(WWW_AUTHENTICATE);
+      // A response without the header is not an error: skip it instead of
+      // adding null and tripping a NullPointerException in the loop below.
+      if (headerValue != null)
+        challenge.add(headerValue);
+
+      for (String challengeString : challenge) {
+        // NOTE(review): the replacement literal has no quotes around the
+        // realm, while the pattern in HttpBasicAuthentication requires
+        // them - confirm whether this substitution can ever match.
+        if (challengeString.equals("NTLM"))
+          challengeString = "Basic realm=techweb";
+
+        if (LOG.isTraceEnabled())
+          LOG.trace("Checking challengeString=" + challengeString);
+
+        HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(
+            challengeString, conf);
+        if (auth != null)
+          return auth;
+
+        // TODO Add additional Authentication lookups here
+      }
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
new file mode 100644
index 0000000..0cc2de5
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+// Commons Codec imports
+import org.apache.commons.codec.binary.Base64;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are
+ * stored in standard Nutch configuration files using the following properties:
+ * http.auth.basic.&lt;realm&gt;.user http.auth.basic.&lt;realm&gt;.password
+ * 
+ * @author Matt Tencati
+ */
+public class HttpBasicAuthentication implements HttpAuthentication,
+    Configurable {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpBasicAuthentication.class);
+
+  /** Matches a complete challenge of the form <code>Basic realm="name"</code>. */
+  private static final Pattern basic = Pattern
+      .compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
+
+  /**
+   * Successfully created authentication objects, keyed by realm.
+   * NOTE(review): this is a plain TreeMap with no synchronization - confirm
+   * that {@link #getAuthentication(String, Configuration)} is never called
+   * from several threads at once.
+   */
+  private static final Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>();
+
+  private Configuration conf = null;
+  private String challenge = null;
+  private List<String> credentials = null;
+  private String realm = null;
+
+  /**
+   * Construct an HttpBasicAuthentication for the given challenge parameters.
+   * The challenge parameters are returned by the web server using a
+   * WWW-Authenticate header. This will typically be represented by single line
+   * of the form <code>WWW-Authenticate: Basic realm="myrealm"</code>
+   * 
+   * @param challenge
+   *          the realm taken from the WWW-Authenticate header; it selects the
+   *          <code>http.auth.basic.&lt;realm&gt;.user</code> and
+   *          <code>http.auth.basic.&lt;realm&gt;.password</code> properties
+   * @param conf
+   *          configuration holding the user name and password properties
+   * @throws HttpAuthenticationException
+   *           if the user name or the password is not configured
+   */
+  protected HttpBasicAuthentication(String challenge, Configuration conf)
+      throws HttpAuthenticationException {
+
+    setConf(conf);
+    this.challenge = challenge;
+    // getAuthentication() passes the realm extracted from the header as the
+    // first argument, so record it; otherwise getRealm() always returns null.
+    this.realm = challenge;
+    credentials = new ArrayList<String>();
+
+    String username = this.conf.get("http.auth.basic." + challenge + ".user");
+    String password = this.conf.get("http.auth.basic." + challenge
+        + ".password");
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("BasicAuthentication challenge is " + challenge);
+      LOG.trace("BasicAuthentication username=" + username);
+      // Only report whether a password exists; never log the secret itself.
+      LOG.trace("BasicAuthentication password "
+          + (password == null ? "is not set" : "is set"));
+    }
+
+    if (username == null) {
+      throw new HttpAuthenticationException("Username for " + challenge
+          + " is null");
+    }
+
+    if (password == null) {
+      throw new HttpAuthenticationException("Password for " + challenge
+          + " is null");
+    }
+
+    // NOTE(review): getBytes() uses the platform default charset, so
+    // non-ASCII credentials may encode differently between platforms -
+    // confirm whether a fixed charset is wanted here.
+    byte[] credBytes = (username + ":" + password).getBytes();
+    credentials.add("Authorization: Basic "
+        + new String(Base64.encodeBase64(credBytes)));
+    if (LOG.isTraceEnabled()) {
+      // The encoded header is trivially reversible, so it is not logged.
+      LOG.trace("Basic credentials generated for " + challenge);
+    }
+  }
+
+  /*
+   * ---------------------------------- * <implementation:Configurable> *
+   * ----------------------------------
+   */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /*
+   * ---------------------------------- * </implementation:Configurable> *
+   * ----------------------------------
+   */
+
+  /**
+   * Gets the Basic credentials generated by this HttpBasicAuthentication object
+   * 
+   * @return Credentials in the form of
+   *         <code>Authorization: Basic &lt;Base64 encoded userid:password&gt;</code>
+   */
+  public List<String> getCredentials() {
+    return credentials;
+  }
+
+  /**
+   * Gets the realm attribute of the HttpBasicAuthentication object. This should
+   * have been supplied to the {@link #getAuthentication(String, Configuration)}
+   * static method
+   * 
+   * @return The realm
+   */
+  public String getRealm() {
+    return realm;
+  }
+
+  /**
+   * This method is responsible for providing Basic authentication information.
+   * The method caches authentication information for each realm so that the
+   * required authentication information does not need to be regenerated for
+   * every request. The cache is keyed by realm only, so a cached object is
+   * returned regardless of the configuration passed in later calls.
+   * 
+   * @param challenge
+   *          The challenge string provided by the webserver. This is the text
+   *          which follows the WWW-Authenticate header, including the Basic
+   *          tag.
+   * @param conf
+   *          configuration used to look up the credentials when no cached
+   *          object exists for the realm
+   * @return An HttpBasicAuthentication object or null if unable to generate
+   *         appropriate credentials.
+   */
+  public static HttpBasicAuthentication getAuthentication(String challenge,
+      Configuration conf) {
+    if (challenge == null)
+      return null;
+
+    Matcher basicMatcher = basic.matcher(challenge);
+    if (!basicMatcher.matches())
+      return null;
+
+    String realm = basicMatcher.group(1);
+    HttpBasicAuthentication auth = authMap.get(realm);
+    if (auth != null)
+      return auth;
+
+    try {
+      auth = new HttpBasicAuthentication(realm, conf);
+    } catch (HttpAuthenticationException hae) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("HttpBasicAuthentication failed for " + challenge, hae);
+      }
+      // Failures are not cached, so a later call tries again.
+      return null;
+    }
+    authMap.put(realm, auth);
+    return auth;
+  }
+
+  /**
+   * Provides a pattern which can be used by an outside resource to determine if
+   * this class can provide credentials based on simple header information. It
+   * does not calculate any information regarding realms or challenges.
+   * 
+   * @return Returns a Pattern which will match a Basic WWW-Authenticate header.
+   */
+  public static final Pattern getBasicPattern() {
+    return basic;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
new file mode 100644
index 0000000..b713ab6
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+public class HttpFormAuthConfigurer {
+  /** URL used for the login request. */
+  private String loginUrl;
+  /** Identifier of the login form. */
+  private String loginFormId;
+  /**
+   * The data posted to login form, such as username(or email), password
+   */
+  private Map<String, String> loginPostData;
+  /**
+   * In case we need add additional headers.
+   */
+  private Map<String, String> additionalPostHeaders;
+  /**
+   * If http post login returns redirect code: 301 or 302, 
+   * Http Client will automatically follow the redirect.
+   */
+  private boolean loginRedirect;
+  /**
+   * Used when we need remove some form fields.
+   */
+  private Set<String> removedFormFields;
+
+  /** Creates a configurer with every setting left unset. */
+  public HttpFormAuthConfigurer() {
+  }
+
+  /** @return the login URL, or null if none was set */
+  public String getLoginUrl() {
+    return this.loginUrl;
+  }
+
+  /** Stores the login URL and returns this configurer for chaining. */
+  public HttpFormAuthConfigurer setLoginUrl(String loginUrl) {
+    this.loginUrl = loginUrl;
+    return this;
+  }
+
+  /** @return the login form identifier, or null if none was set */
+  public String getLoginFormId() {
+    return this.loginFormId;
+  }
+
+  /** Stores the login form identifier and returns this configurer. */
+  public HttpFormAuthConfigurer setLoginFormId(String loginForm) {
+    this.loginFormId = loginForm;
+    return this;
+  }
+
+  /**
+   * @return the configured post data, or a fresh empty map (not retained by
+   *         this object) if none was set
+   */
+  public Map<String, String> getLoginPostData() {
+    if (this.loginPostData != null) {
+      return this.loginPostData;
+    }
+    return new HashMap<String, String>();
+  }
+
+  /** Stores the post data map and returns this configurer. */
+  public HttpFormAuthConfigurer setLoginPostData(
+      Map<String, String> loginPostData) {
+    this.loginPostData = loginPostData;
+    return this;
+  }
+
+  /**
+   * @return the configured additional headers, or a fresh empty map (not
+   *         retained by this object) if none were set
+   */
+  public Map<String, String> getAdditionalPostHeaders() {
+    if (this.additionalPostHeaders != null) {
+      return this.additionalPostHeaders;
+    }
+    return new HashMap<String, String>();
+  }
+
+  /** Stores the additional headers map and returns this configurer. */
+  public HttpFormAuthConfigurer setAdditionalPostHeaders(
+      Map<String, String> additionalPostHeaders) {
+    this.additionalPostHeaders = additionalPostHeaders;
+    return this;
+  }
+
+  /** @return the login redirect flag */
+  public boolean isLoginRedirect() {
+    return this.loginRedirect;
+  }
+
+  /** Stores the login redirect flag and returns this configurer. */
+  public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) {
+    this.loginRedirect = redirect;
+    return this;
+  }
+
+  /**
+   * @return the configured set of removed form fields, or a fresh empty set
+   *         (not retained by this object) if none was set
+   */
+  public Set<String> getRemovedFormFields() {
+    if (this.removedFormFields != null) {
+      return this.removedFormFields;
+    }
+    return new HashSet<String>();
+  }
+
+  /** Stores the set of removed form fields and returns this configurer. */
+  public HttpFormAuthConfigurer setRemovedFormFields(
+      Set<String> removedFormFields) {
+    this.removedFormFields = removedFormFields;
+    return this;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
new file mode 100644
index 0000000..4c73f50
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.io.IOUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class HttpFormAuthentication {
+  private static final Logger LOGGER = LoggerFactory
+      .getLogger(HttpFormAuthentication.class);
+
+  /**
+   * Baseline headers for the login POST. Filled once in the static
+   * initializer and only copied afterwards, never modified.
+   */
+  private static final Map<String, String> defaultLoginHeaders = new HashMap<String, String>();
+
+  static {
+    defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
+    defaultLoginHeaders
+    .put("Accept",
+        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+    defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
+    defaultLoginHeaders.put("Connection", "keep-alive");
+    defaultLoginHeaders.put("Content-Type",
+        "application/x-www-form-urlencoded");
+  }
+
+  private HttpClient client;
+  private HttpFormAuthConfigurer authConfigurer = new HttpFormAuthConfigurer();
+  private String cookies;
+  /**
+   * Per-instance copy of the baseline headers, so that one instance's
+   * settings do not leak into other instances through the static map.
+   */
+  private final Map<String, String> loginHeaders = new HashMap<String, String>(
+      defaultLoginHeaders);
+
+  /**
+   * Creates a form authentication that uses the given configurer and client
+   * and takes its Accept, Accept-Language and User-Agent headers from the
+   * given protocol object.
+   * 
+   * @param authConfigurer
+   *          login form settings
+   * @param client
+   *          client used to send the login requests
+   * @param http
+   *          source of the Accept, Accept-Language and User-Agent values
+   */
+  public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer,
+      HttpClient client, Http http) {
+    this.authConfigurer = authConfigurer;
+    this.client = client;
+    overrideLoginHeader("Accept", http.getAccept());
+    overrideLoginHeader("Accept-Language", http.getAcceptLanguage());
+    overrideLoginHeader("User-Agent", http.getUserAgent());
+  }
+
+  /**
+   * Creates a form authentication with its own new {@link HttpClient} and the
+   * baseline login headers.
+   * 
+   * @param loginUrl
+   *          URL used for the login requests
+   * @param loginForm
+   *          id (or name) of the login form
+   * @param loginPostData
+   *          values to post with the form; <code>null</code> means none
+   * @param additionalPostHeaders
+   *          extra request headers; <code>null</code> means none
+   * @param removedFormFields
+   *          form fields to leave out of the post; <code>null</code> means none
+   */
+  public HttpFormAuthentication(String loginUrl, String loginForm,
+      Map<String, String> loginPostData,
+      Map<String, String> additionalPostHeaders,
+      Set<String> removedFormFields) {
+    this.authConfigurer.setLoginUrl(loginUrl);
+    this.authConfigurer.setLoginFormId(loginForm);
+    this.authConfigurer
+    .setLoginPostData(loginPostData == null ? new HashMap<String, String>()
+        : loginPostData);
+    this.authConfigurer
+    .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>()
+        : additionalPostHeaders);
+    this.authConfigurer
+    .setRemovedFormFields(removedFormFields == null ? new HashSet<String>()
+        : removedFormFields);
+    this.client = new HttpClient();
+  }
+
+  /**
+   * Fetches the login page, collects the fields of the login form and posts
+   * them, together with the configured login data, to the login URL.
+   * 
+   * @throws Exception
+   *           if a request fails or the login form cannot be found
+   */
+  public void login() throws Exception {
+    // make sure cookies are turned on
+    // NOTE(review): this replaces the JVM-wide java.net cookie handler, while
+    // the requests below go through commons-httpclient - confirm it is needed.
+    CookieHandler.setDefault(new CookieManager());
+    String pageContent = httpGetPageContent(authConfigurer.getLoginUrl());
+    List<NameValuePair> params = getLoginFormParams(pageContent);
+    sendPost(authConfigurer.getLoginUrl(), params);
+  }
+
+  /** Replaces a login header for this instance, ignoring null values. */
+  private void overrideLoginHeader(String name, String value) {
+    if (value != null) {
+      loginHeaders.put(name, value);
+    }
+  }
+
+  /** Reads a response body as a string; a missing body yields "". */
+  private static String readBody(java.io.InputStream body) throws IOException {
+    // getResponseBodyAsStream() gives null when the response has no body.
+    return body == null ? "" : IOUtils.toString(body);
+  }
+
+  /** Posts the given parameters to the URL and logs the outcome. */
+  private void sendPost(String url, List<NameValuePair> params)
+      throws Exception {
+    PostMethod post = null;
+    try {
+      if (authConfigurer.isLoginRedirect()) {
+        post = new PostMethod(url) {
+          @Override
+          public boolean getFollowRedirects() {
+            return true;
+          }
+        };
+      } else {
+        post = new PostMethod(url);
+      }
+      // we can't use post.setFollowRedirects(true) as it will throw
+      // IllegalArgumentException:
+      // Entity enclosing requests cannot be redirected without user
+      // intervention
+      setLoginHeader(post);
+      post.addParameters(params.toArray(new NameValuePair[0]));
+      int rspCode = client.executeMethod(post);
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug("rspCode: " + rspCode);
+        LOGGER.debug("\nSending 'POST' request to URL : " + url);
+
+        // Log the parameter names only: the values include the login
+        // credentials.
+        List<String> paramNames = new ArrayList<String>();
+        for (NameValuePair param : params) {
+          paramNames.add(param.getName());
+        }
+        LOGGER.debug("Post parameter names : " + paramNames);
+        LOGGER.debug("Response Code : " + rspCode);
+        for (Header header : post.getResponseHeaders()) {
+          LOGGER.debug("Response headers : " + header);
+        }
+      }
+      String rst = readBody(post.getResponseBodyAsStream());
+      LOGGER.debug("login post result: {}", rst);
+    } finally {
+      if (post != null) {
+        post.releaseConnection();
+      }
+    }
+  }
+
+  /** Adds the login headers and, if one was received, the cookie to the post. */
+  private void setLoginHeader(PostMethod post) {
+    Map<String, String> headers = new HashMap<String, String>(loginHeaders);
+    // additionalPostHeaders can overwrite value in the default login headers
+    headers.putAll(authConfigurer.getAdditionalPostHeaders());
+    for (Entry<String, String> entry : headers.entrySet()) {
+      post.addRequestHeader(entry.getKey(), entry.getValue());
+    }
+    // No Set-Cookie was seen on the login page: send no Cookie header rather
+    // than one with a null value.
+    String cookieValue = getCookies();
+    if (cookieValue != null) {
+      post.addRequestHeader("Cookie", cookieValue);
+    }
+  }
+
+  /**
+   * Fetches the given URL with GET, remembers the value of its Set-Cookie
+   * response header (if present) and returns the response body.
+   */
+  private String httpGetPageContent(String url) throws IOException {
+
+    GetMethod get = new GetMethod(url);
+    try {
+      for (Entry<String, String> entry : authConfigurer
+          .getAdditionalPostHeaders().entrySet()) {
+        get.addRequestHeader(entry.getKey(), entry.getValue());
+      }
+      client.executeMethod(get);
+      Header cookieHeader = get.getResponseHeader("Set-Cookie");
+      if (cookieHeader != null) {
+        setCookies(cookieHeader.getValue());
+      }
+      return readBody(get.getResponseBodyAsStream());
+    } finally {
+      get.releaseConnection();
+    }
+
+  }
+
+  /**
+   * Builds the parameters for the login post: the input fields of the login
+   * form (looked up by id, then by name) that are neither removed nor
+   * overridden, followed by the configured login post data.
+   * 
+   * @throws IllegalArgumentException
+   *           if the page has no form with the configured id or name
+   */
+  private List<NameValuePair> getLoginFormParams(String pageContent)
+      throws UnsupportedEncodingException {
+    List<NameValuePair> params = new ArrayList<NameValuePair>();
+    Document doc = Jsoup.parse(pageContent);
+    Element loginform = doc.getElementById(authConfigurer.getLoginFormId());
+    if (loginform == null) {
+      LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
+          authConfigurer.getLoginFormId());
+      loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first();
+      if (loginform == null) {
+        LOGGER.debug("No form element found with 'name' = {}",
+            authConfigurer.getLoginFormId());
+        throw new IllegalArgumentException("No form exists: "
+            + authConfigurer.getLoginFormId());
+      }
+    }
+    Map<String, String> postData = authConfigurer.getLoginPostData();
+    Set<String> removedFields = authConfigurer.getRemovedFormFields();
+    Elements inputElements = loginform.getElementsByTag("input");
+    // skip fields in removedFormFields or loginPostData
+    for (Element inputElement : inputElements) {
+      String key = inputElement.attr("name");
+      // attr() yields "" for inputs without a name; such controls are not
+      // part of a form submission, so they are left out.
+      if (key.length() == 0 || postData.containsKey(key)
+          || removedFields.contains(key)) {
+        continue;
+      }
+      params.add(new NameValuePair(key, inputElement.attr("value")));
+    }
+    // add key and value in loginPostData
+    for (Entry<String, String> entry : postData.entrySet()) {
+      params.add(new NameValuePair(entry.getKey(), entry.getValue()));
+    }
+    return params;
+  }
+
+  /** @return the cookie value sent with the login post, or null if none */
+  public String getCookies() {
+    return cookies;
+  }
+
+  /** Sets the cookie value sent with the login post. */
+  public void setCookies(String cookies) {
+    this.cookies = cookies;
+  }
+
+  /** @return whether the login post follows redirects */
+  public boolean isRedirect() {
+    return authConfigurer.isLoginRedirect();
+  }
+
+  /** Sets whether the login post follows redirects. */
+  public void setRedirect(boolean redirect) {
+    this.authConfigurer.setLoginRedirect(redirect);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
new file mode 100644
index 0000000..f074af2
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+// HTTP Client imports
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/**
+ * An HTTP response.
+ * 
+ * @author Susam Pal
+ */
+public class HttpResponse implements Response {
+
+  private URL url;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /**
+   * Fetches the given <code>url</code> and prepares HTTP response.
+   * 
+   * @param http
+   *          An instance of the implementation class of this plugin
+   * @param url
+   *          URL to be fetched
+   * @param datum
+   *          Crawl data
+   * @param followRedirects
+   *          Whether to follow redirects; follows redirect if and only if this
+   *          is true
+   * @return HTTP response
+   * @throws IOException
+   *           When an error occurs
+   */
+  HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects)
+      throws IOException {
+
+    // Prepare GET method for HTTP request
+    this.url = url;
+    GetMethod get = new GetMethod(url.toString());
+    get.setFollowRedirects(followRedirects);
+    get.setDoAuthentication(true);
+    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+      get.setRequestHeader("If-Modified-Since",
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    // Set HTTP parameters
+    HttpMethodParams params = get.getParams();
+    if (http.getUseHttp11()) {
+      params.setVersion(HttpVersion.HTTP_1_1);
+    } else {
+      params.setVersion(HttpVersion.HTTP_1_0);
+    }
+    params.makeLenient();
+    params.setContentCharset("UTF-8");
+    params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+    params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+    // XXX (ab) not sure about this... the default is to retry 3 times; if
+    // XXX the request body was sent the method is not retried, so there is
+    // XXX little danger in retrying...
+    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+    try {
+      HttpClient client = Http.getClient();
+      client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
+      code = client.executeMethod(get);
+
+      Header[] heads = get.getResponseHeaders();
+
+      for (int i = 0; i < heads.length; i++) {
+        headers.set(heads[i].getName(), heads[i].getValue());
+      }
+
+      // Limit download size
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: " + contentLengthString);
+        }
+      }
+      if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
+      // always read content. Sometimes content is useful to find a cause
+      // for error.
+      InputStream in = get.getResponseBodyAsStream();
+      try {
+        byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+        int bufferFilled = 0;
+        int totalRead = 0;
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+            && totalRead + bufferFilled <= contentLength) {
+          totalRead += bufferFilled;
+          out.write(buffer, 0, bufferFilled);
+        }
+
+        content = out.toByteArray();
+      } catch (Exception e) {
+        if (code == 200)
+          throw new IOException(e.toString());
+        // for codes other than 200 OK, we are fine with empty content
+      } finally {
+        if (in != null) {
+          in.close();
+        }
+        get.abort();
+      }
+
+      StringBuilder fetchTrace = null;
+      if (Http.LOG.isTraceEnabled()) {
+        // Trace message
+        fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
+            + "; bytes received: " + content.length);
+        if (getHeader(Response.CONTENT_LENGTH) != null)
+          fetchTrace.append("; Content-Length: "
+              + getHeader(Response.CONTENT_LENGTH));
+        if (getHeader(Response.LOCATION) != null)
+          fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
+      }
+      // Extract gzip, x-gzip and deflate content
+      if (content != null) {
+        // check if we have to uncompress it
+        String contentEncoding = headers.get(Response.CONTENT_ENCODING);
+        if (contentEncoding != null && Http.LOG.isTraceEnabled())
+          fetchTrace.append("; Content-Encoding: " + contentEncoding);
+        if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+          content = http.processGzipEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
+        } else if ("deflate".equals(contentEncoding)) {
+          content = http.processDeflateEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
+        }
+      }
+
+      // Logger trace message
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace(fetchTrace.toString());
+      }
+    } finally {
+      get.releaseConnection();
+    }
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * -------------------------- * </implementation:Response> *
+   * --------------------------
+   */
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
new file mode 100644
index 0000000..9cbcb14
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the HTTP and
+HTTPS protocols, optionally with Basic, Digest and NTLM authentication
+schemes for web server as well as proxy server. It handles cookies
+within a single fetch operation. This plugin is based on Jakarta
+Commons HttpClient library.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml b/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
new file mode 100644
index 0000000..3c0203b
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<auth-configuration>
+
+  <!-- Default credentials -->
+  <credentials username="userx" password="passx">
+    <default/>
+    <authscope host="127.0.0.1" port="47500"/>
+  </credentials>
+
+  <!-- Define a realm for 127.0.0.1:47501 so that authentication for
+       other realms fails (except that another realm for 127.0.0.1:47501
+       is defined below for the NTLM scheme). -->
+  <credentials username="userx" password="passx">
+    <authscope host="127.0.0.1" port="47501" realm="realmx"
+    scheme="BASIC"/>
+  </credentials>
+
+  <!-- Test case for NTLM authentication scheme. -->
+  <credentials username="ntlm_user" password="ntlm_pass">
+    <authscope host="127.0.0.1" port="47501" realm="NUTCH"
+    scheme="NTLM"/>
+  </credentials>
+
+  <!-- Test case for credentials selection based on scheme (realm1 is
+       present in basic.jsp as well as digest.jsp).
+       Also tests Digest authentication scheme. -->
+  <credentials username="digest_user" password="digest_pass">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"
+    scheme="DIGEST"/>
+  </credentials>
+
+  <!-- Test case for Basic authentication scheme. -->
+  <credentials username="user1" password="pass1">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"/>
+  </credentials>
+  <credentials username="user2" password="pass2">
+    <authscope host="127.0.0.1" port="47500" realm="realm2"/>
+  </credentials>
+
+</auth-configuration>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml b/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
new file mode 100644
index 0000000..856ea15
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>Nutch-Test,*</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.description</name>
+  <value>Nutch protocol-httpclient test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth-test.xml</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+  <description></description>
+</property>
+
+</configuration>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
new file mode 100644
index 0000000..783e5af
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.net.URL;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.ContextHandler;
+import org.mortbay.jetty.servlet.ServletHandler;
+import org.mortbay.jetty.servlet.SessionHandler;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+
+/**
+ * Test cases for protocol-httpclient. Each test starts an embedded Jetty
+ * server on 127.0.0.1 that serves JSP pages from the test data directory and
+ * fetches pages from it through the plugin's {@link Http} class.
+ */
+public class TestProtocolHttpClient {
+
+  private Server server;
+  private Configuration conf;
+  // Directory the JSP test pages are served from
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+  // Port the server was actually started on; set by startServer()
+  private int port;
+  // Created and configured in setUp()
+  private Http http;
+
+  /**
+   * Prepares the (not yet started) Jetty server and the configured
+   * {@link Http} instance used by every test.
+   *
+   * @throws Exception
+   *           If the fixture cannot be created.
+   */
+  @Before
+  public void setUp() throws Exception {
+
+    ContextHandler context = new ContextHandler();
+    context.setContextPath("/");
+    context.setResourceBase(RES_DIR);
+    ServletHandler sh = new ServletHandler();
+    sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp");
+    context.addHandler(sh);
+    context.addHandler(new SessionHandler());
+
+    server = new Server();
+    server.addHandler(context);
+
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  /**
+   * Stops the Jetty server and waits up to five seconds for it to report
+   * that it has stopped.
+   *
+   * @throws Exception
+   *           If stopping the server fails.
+   */
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+    for (int i = 0; i < 5 && !server.isStopped(); i++) {
+      Thread.sleep(1000);
+    }
+  }
+
+  /**
+   * Tests whether the client can remember cookies.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testCookies() throws Exception {
+    startServer(47500);
+    fetchPage("/cookies.jsp", 200);
+    fetchPage("/cookies.jsp?cookie=yes", 200);
+  }
+
+  /**
+   * Tests that no pre-emptive authorization headers are sent by the client.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testNoPreemptiveAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/noauth.jsp", 200);
+  }
+
+  /**
+   * Tests default credentials.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testDefaultCredentials() throws Exception {
+    startServer(47502);
+    fetchPage("/basic.jsp", 200);
+  }
+
+  /**
+   * Tests basic authentication scheme for various realms.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testBasicAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 200);
+    fetchPage("/basic.jsp?case=2", 200);
+  }
+
+  /**
+   * Tests that authentication happens for a defined realm and not for other
+   * realms for a host:port when an extra <code>authscope</code> tag is not
+   * defined to match all other realms.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testOtherRealmsNoAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 401);
+    fetchPage("/basic.jsp?case=2", 401);
+  }
+
+  /**
+   * Tests Digest authentication scheme.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testDigestAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/digest.jsp", 200);
+  }
+
+  /**
+   * Tests NTLM authentication scheme.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testNtlmAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/ntlm.jsp", 200);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port.
+   *
+   * Will try up to 10 consecutive ports, beginning with <code>portno</code>,
+   * to find an available port to use. The port that was finally used is
+   * stored in the <code>port</code> field, which <code>fetchPage</code>
+   * reads.
+   *
+   * NOTE(review): the credentials in httpclient-auth-test.xml are scoped by
+   * port, so a test that falls back to another port may see different
+   * authentication results -- confirm whether that is acceptable.
+   *
+   * @param portno
+   *          First port number to try.
+   * @throws Exception
+   *           When the server could not be started on any of the ten ports.
+   */
+  private void startServer(int portno) throws Exception {
+    SocketConnector listener = new SocketConnector();
+    listener.setHost("127.0.0.1");
+    server.addConnector(listener);
+    for (int p = portno; p < portno + 10; p++) {
+      // use the candidate port of this iteration, not always the first one
+      port = p;
+      listener.setPort(port);
+      try {
+        server.start();
+        break;
+      } catch (Exception e) {
+        // give up only after the last candidate port has failed
+        if (p == portno + 9) {
+          throw e;
+        }
+      }
+    }
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   * @throws Exception
+   *           When an error occurs or test case fails.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    Response response = http.getResponse(url, new CrawlDatum(), true);
+
+    int code = response.getCode();
+    Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/README.md b/nutch-plugins/protocol-interactiveselenium/README.md
new file mode 100644
index 0000000..dd43ee7
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/README.md
@@ -0,0 +1,38 @@
+Nutch Interactive Selenium
+==========================
+
+This protocol plugin allows you to fetch and interact with pages using [Selenium](http://www.seleniumhq.org/).
+
+# Dependencies and Configuration
+
+You will need to have [Selenium](http://www.seleniumhq.org/) and a compatible version of Firefox installed to use this plugin.
+
+Enable the plugin by adding it to the `plugin.includes` property in your Nutch configuration file:
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+
+<configuration>
+  ...
+  <property>
+    <name>plugin.includes</name>
+    <value>protocol-interactiveselenium|urlfilter-regex| ... </value>
+    <description></description>
+  </property>
+```
+
+# Custom Handlers
+
+Only basic functionality is included in the DefaultHandler that comes with the plugin. If you want additional functionality you can implement custom handlers by implementing the InteractiveSeleniumHandler interface in the plugin package. Be sure to also update the plugin config to include your new handler.
+
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+<property>
+  <name>interactiveselenium.handlers</name>
+  <value>NewCustomHandler,DefaultHandler</value>
+  <description></description>
+</property>
+```
+
+# Handler Info
+
+Handlers are called in the order in which they are specified in the configuration. A "clean" driver is used for each handler, so multiple handlers won't interfere with each other. The page content produced by each handler is concatenated and returned for the request.

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/build-ivy.xml b/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
new file mode 100644
index 0000000..9f96619
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/build.xml b/nutch-plugins/protocol-interactiveselenium/build.xml
new file mode 100644
index 0000000..69dab90
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+      <include name="**/protocol-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>