You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:23 UTC

[07/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
new file mode 100644
index 0000000..8b1a031
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -0,0 +1,573 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+
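+/**
+ * An HTTP response fetched over a raw socket. Bodies whose content type is
+ * HTML or XHTML are obtained through HtmlUnit instead of being read from the
+ * socket.
+ */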
+public class HttpResponse implements Response {
+
+  private Configuration conf;
+  private HttpBase http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
+
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
+
+  /**
+   * Fetches {@code url} and populates this response's status code, headers
+   * and content.
+   * <p>
+   * The request is sent as an HTTP/1.0 {@code GET} over a plain socket (to the
+   * configured proxy when {@code http.useProxy(url)} is true); for
+   * {@code https} URLs the socket is wrapped in an {@link SSLSocket} limited
+   * to the preferred TLS protocols and cipher suites reported by {@code http}.
+   * After the status line and headers are parsed, the body is obtained
+   * according to the {@code Content-Type} header:
+   * <ul>
+   * <li>contains {@code text/html} or {@code application/xhtml}: the content
+   * comes from {@code readContentFromHtmlUnit(url)}, not from this socket;</li>
+   * <li>any other value: the body is read from the socket (chunked or plain)
+   * and gzip/deflate content encodings are decoded;</li>
+   * <li>header absent: no body is read and the content stays {@code null}.</li>
+   * </ul>
+   * The socket is closed in a {@code finally} block before the constructor
+   * returns or throws.
+   *
+   * @param http
+   *          protocol implementation supplying timeouts, proxy settings, TLS
+   *          preferences, request header values and the configuration
+   * @param url
+   *          URL to fetch; its protocol must be {@code http} or {@code https}
+   * @param datum
+   *          crawl datum whose modified time is sent as
+   *          {@code If-Modified-Since} when that feature is enabled
+   * @throws ProtocolException
+   *           declared for protocol-level failures; an {@link HttpException}
+   *           is thrown here for an unsupported URL scheme
+   * @throws IOException
+   *           if connecting, writing the request or reading the response fails
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    Scheme scheme = null;
+
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    // an empty file part means the root document
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
+    // don't want the :80, so the port is only appended to the Host header
+    // when the URL names one explicitly
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      // no explicit port: use the scheme's default and keep it out of the
+      // Host header
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect, either to the proxy or directly to the target host
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      if (scheme == Scheme.HTTPS) {
+        // layer TLS over the already connected socket
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+            .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+            .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+            protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+            ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        // from here on all traffic goes through the TLS socket
+        socket = sslsocket;
+      }
+
+      this.conf = http.getConf();
+      // optionally record the address that was connected to (this is the
+      // proxy's address when a proxy is used)
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        // through a proxy the request line carries the absolute URL
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        // the request is still sent, just without a User-Agent header
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      // NOTE(review): datum is dereferenced without a null check - confirm
+      // callers never pass null when If-Modified-Since is enabled
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+            .toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      // blank line terminates the request headers
+      reqStr.append("\r\n");
+
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false) == true) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+      // NOTE(review): getBytes() uses the platform default charset - confirm
+      // that is acceptable for the request line and headers
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(
+              new BufferedInputStream(socket.getInputStream(),
+                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      // scratch buffer shared by all line-oriented parsing below
+      StringBuffer line = new StringBuffer();
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false) == true) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
+      // keep reading status line + headers until a status other than
+      // "100 Continue" has been seen
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
+        // parse headers
+        parseHeaders(in, line, httpHeaders);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with HtmlUnit only if content type is HTML or XHTML
+      // NOTE(review): when the Content-Type header is missing neither branch
+      // runs, so no body is read and content stays null - confirm intended
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          // the body on this socket is ignored; the page is obtained separately
+          readContentFromHtmlUnit(url);
+        } else {
+          String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+          if (transferEncoding != null && "chunked"
+              .equalsIgnoreCase(transferEncoding.trim())) {
+            readChunkedContent(in, line);
+          } else {
+            readPlainContent(in);
+          }
+
+          String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+          if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+            content = http.processGzipEncoded(content, url);
+          } else if ("deflate".equals(contentEncoding)) {
+            content = http.processDeflateEncoded(content, url);
+          } else {
+            // store the headers verbatim only if the response was not compressed
+            // as the content length reported will not match otherwise
+            if (httpHeaders != null) {
+              headers.add("_response.headers_", httpHeaders.toString());
+            }
+            if (Http.LOG.isTraceEnabled()) {
+              Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+            }
+          }
+        }
+      }
+
+    } finally {
+      // always release the connection, also when an exception is propagating
+      if (socket != null)
+        socket.close();
+    }
+
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  /** Returns the URL this response was fetched for. */
+  public URL getUrl() {
+    return this.url;
+  }
+
+  /** Returns the status code parsed from the response's status line. */
+  public int getCode() {
+    return this.code;
+  }
+
+  /**
+   * Looks up a single header value.
+   *
+   * @param name
+   *          header name to look up in the collected headers
+   * @return whatever the header metadata holds for {@code name}
+   */
+  public String getHeader(String name) {
+    return this.headers.get(name);
+  }
+
+  /** Returns the metadata object holding all headers collected for this response. */
+  public Metadata getHeaders() {
+    return this.headers;
+  }
+
+  /**
+   * Returns the response body; this is {@code null} when no body was read.
+   */
+  public byte[] getContent() {
+    return this.content;
+  }
+
+  /*
+   * ------------------------- * </implementation:Response> *
+   * -------------------------
+   */
+
+  private void readContentFromHtmlUnit(URL url) throws IOException {
+    String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
+    content = page.getBytes("UTF-8");
+  }
+  
+  /**
+   * Reads a non-chunked response body from {@code in} and stores it as this
+   * response's content.
+   * <p>
+   * At most {@code Content-Length} bytes are read, further capped by
+   * {@code http.getMaxContent()} when that value is non-negative. Every read,
+   * including the first one, is bounded by the remaining allowance, so the
+   * stored content never exceeds the limit. A missing, empty or negative
+   * {@code Content-Length} is treated as "unknown": the body is then read
+   * until end of stream or until the configured maximum is reached.
+   *
+   * @param in
+   *          stream positioned at the first byte of the body
+   * @throws HttpException
+   *           if the {@code Content-Length} header is not a parsable integer
+   * @throws IOException
+   *           if reading from the stream fails
+   */
+  private void readPlainContent(InputStream in)
+      throws HttpException, IOException {
+
+    // number of bytes we are allowed to read; "unbounded" until a usable
+    // Content-Length header or the configured maximum says otherwise
+    int contentLength = Integer.MAX_VALUE;
+    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+    if (contentLengthString != null) {
+      contentLengthString = contentLengthString.trim();
+      try {
+        if (!contentLengthString.isEmpty())
+          contentLength = Integer.parseInt(contentLengthString);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad content length: " + contentLengthString);
+      }
+      if (contentLength < 0) {
+        // a negative length carries no information; fall back to "unknown"
+        contentLength = Integer.MAX_VALUE;
+      }
+    }
+
+    // limit download size
+    int maxContent = http.getMaxContent();
+    if (maxContent >= 0 && contentLength > maxContent) {
+      contentLength = maxContent;
+    }
+
+    // do not try to read if the contentLength is 0
+    if (contentLength == 0) {
+      content = new byte[0];
+      return;
+    }
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    int length = 0;
+
+    // read content, never asking for more than the remaining allowance
+    while (length < contentLength) {
+      int toRead = Math.min(bytes.length, contentLength - length);
+      int i = in.read(bytes, 0, toRead);
+      if (i == -1) {
+        break; // server closed the stream before the limit was reached
+      }
+      out.write(bytes, 0, i);
+      length += i;
+    }
+    content = out.toByteArray();
+  }
+
+  /**
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+      // }
+
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+        // line.substring(pos+1)); }
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
+          .getMaxContent())
+        chunkLen = http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+            (chunkLen - chunkBytesRead) :
+            Http.BUFFER_SIZE;
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Will printed GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+        // len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent())
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line, null);
+
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException(
+          "bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line)
+      throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines from {@code in} until a blank line and adds each of
+   * them to the header metadata.
+   * <p>
+   * Some servers omit the blank line between headers and body. When a line
+   * contains the start of an HTML document ({@code <!DOCTYPE}, {@code <HTML}
+   * or {@code <html}), everything from that marker on is pushed back onto the
+   * stream so that it is read as content, the remainder is processed as a
+   * last header on a best-effort basis, and parsing stops.
+   *
+   * @param in
+   *          stream positioned at the first header line
+   * @param line
+   *          scratch buffer that receives each line
+   * @param httpHeaders
+   *          if non-null, every raw line is appended to it followed by a
+   *          newline
+   * @throws IOException
+   *           if reading or pushing back fails
+   * @throws HttpException
+   *           if a header line is malformed
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line,
+      StringBuffer httpHeaders) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      if (httpHeaders != null)
+        httpHeaders.append(line).append("\n");
+
+      // handle HTTP responses with missing blank line after headers
+      int pos = line.indexOf("<!DOCTYPE");
+      if (pos == -1) {
+        pos = line.indexOf("<HTML");
+      }
+      if (pos == -1) {
+        pos = line.indexOf("<html");
+      }
+      if (pos == -1) {
+        processHeaderLine(line);
+        continue;
+      }
+
+      // readLine() stored every byte as one char, so ISO-8859-1 restores the
+      // exact original bytes; UTF-8 would expand bytes >= 0x80 into two bytes
+      // and corrupt the pushed-back content
+      in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+      line.setLength(pos);
+
+      try {
+        // TODO: (CM) We don't know the header names here
+        // since we're just handling them generically. It would
+        // be nice to provide some sort of mapping function here
+        // for the returned header names to the standard metadata
+        // names in the ParseData class
+        processHeaderLine(line);
+      } catch (Exception e) {
+        // best effort: the text in front of the markup may not be a header
+        Http.LOG.warn("Error: ", e);
+      }
+      return;
+    }
+  }
+
+  /**
+   * Reads one line from {@code in} into {@code line}, replacing its previous
+   * contents. A line ends at CR, LF or CRLF; the terminator is not stored.
+   * Each byte is appended as one char.
+   *
+   * @param in
+   *          stream to read from
+   * @param line
+   *          buffer that receives the line
+   * @param allowContinuedLine
+   *          when true and the line read so far is not empty, a following
+   *          line that starts with a space or tab is treated as a
+   *          continuation: that one whitespace character is dropped and
+   *          reading goes on into the same buffer
+   * @return the length of the line read
+   * @throws EOFException
+   *           if the stream ends before a line terminator is seen
+   * @throws IOException
+   *           if reading fails
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    int c;
+    while ((c = in.read()) != -1) {
+      if (c != '\r' && c != '\n') {
+        line.append((char) c);
+        continue;
+      }
+
+      // swallow the LF of a CRLF pair
+      if (c == '\r' && peek(in) == '\n') {
+        in.read();
+      }
+
+      // at EOL -- check for continued line if the current
+      // (possibly continued) line wasn't blank
+      if (line.length() > 0 && allowContinuedLine) {
+        int next = peek(in);
+        if (next == ' ' || next == '\t') {
+          in.read(); // line is continued
+          continue;
+        }
+      }
+      return line.length();
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of {@code in} without consuming it.
+   *
+   * @param in
+   *          stream to look ahead in
+   * @return the next byte (0-255), or -1 at end of stream
+   * @throws IOException
+   *           if reading or pushing back fails
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // unread(-1) would push back the byte 0xFF and fabricate data at end of
+    // stream, so only real bytes are returned to the stream
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
new file mode 100644
index 0000000..4181951
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -0,0 +1,21 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http protocol,
+using HtmlUnit to obtain the content of HTML and XHTML pages.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/build.xml b/nutch-plugins/protocol-http/build.xml
new file mode 100755
index 0000000..30720f1
--- /dev/null
+++ b/nutch-plugins/protocol-http/build.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build for the protocol-http plugin; "jar-core" is the default target. -->
+<project name="protocol-http" default="jar-core">
+
+  <!-- Shared plugin build logic -->
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies: runs the "jar" target of lib-http -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath: the lib-http jars found
+       under the build directory plus the test configuration directory -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies and copy all non-Java test resources
+       next to the compiled tests -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test: copy the JSP fixtures from jsp/ into the test data
+       directory. These two tasks are declared outside any target. -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/ivy.xml b/nutch-plugins/protocol-http/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/protocol-http/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!-- Ivy descriptor for this plugin; the module name is taken from the Ant
+     project name. -->
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <!-- Configurations are shared with the rest of the build -->
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <!-- No dependencies are declared for this plugin -->
+  <dependencies>
+  </dependencies>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/basic-http.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/basic-http.jsp b/nutch-plugins/protocol-http/jsp/basic-http.jsp
new file mode 100644
index 0000000..bf1f8bd
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/basic-http.jsp
@@ -0,0 +1,44 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Test fixture for the protocol-http plugin: serves a minimal static HTML page.
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/brokenpage.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/brokenpage.jsp b/nutch-plugins/protocol-http/jsp/brokenpage.jsp
new file mode 100644
index 0000000..f3f7c4a
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/brokenpage.jsp
@@ -0,0 +1,47 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
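+  Test fixture for the protocol-http plugin: a malformed page (apparently on
+  purpose, going by the file name) whose page directive and scriptlet lines
+  lack their JSP delimiters.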
+--%>
+
+@ page language="java" import="java.util.*" pageEncoding="UTF-8"
+
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect301.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/redirect301.jsp b/nutch-plugins/protocol-http/jsp/redirect301.jsp
new file mode 100644
index 0000000..1100b89
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/redirect301.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Test fixture for the protocol-http plugin: sets status 301 and a Location
+  header pointing at http://nutch.apache.org.
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+
+  </head>
+  
+  <body>
+       <%
+	response.setStatus(301);
+	response.setHeader( "Location", "http://nutch.apache.org");
+	response.setHeader( "Connection", "close" );
+		%> 
+    You are redirected by JSP<br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect302.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/redirect302.jsp b/nutch-plugins/protocol-http/jsp/redirect302.jsp
new file mode 100644
index 0000000..8a250d9
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/redirect302.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin 
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+
+  </head>
+  
+  <body>
+       <%
+	response.setStatus(302);
+	response.setHeader( "Location", "http://nutch.apache.org");
+	response.setHeader( "Connection", "close" );
+		%> 
+    You are successfully redirected by JSP<br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/plugin.xml b/nutch-plugins/protocol-http/plugin.xml
new file mode 100755
index 0000000..8770b10
--- /dev/null
+++ b/nutch-plugins/protocol-http/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-http"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-http.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.http"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.http.Http"
+                      class="org.apache.nutch.protocol.http.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+      
+      <implementation id="org.apache.nutch.protocol.http.Http"
+                       class="org.apache.nutch.protocol.http.Http">
+           <parameter name="protocolName" value="https"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/pom.xml b/nutch-plugins/protocol-http/pom.xml
new file mode 100644
index 0000000..e7ade28
--- /dev/null
+++ b/nutch-plugins/protocol-http/pom.xml
@@ -0,0 +1,57 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-http</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-http</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.26</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jsp-2.1</artifactId>
+            <version>6.1.14</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
new file mode 100755
index 0000000..56f9f4f
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Protocol implementation registered for the <code>http</code> and
+ * <code>https</code> schemes. All shared behaviour comes from
+ * {@link HttpBase}; this class only supplies the logger and creates the
+ * {@link HttpResponse} that performs the actual fetch.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /**
+   * Public default constructor. Hands this plugin's logger to the superclass.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Set the {@link org.apache.hadoop.conf.Configuration} object.
+   *
+   * @param conf
+   *          the configuration, passed on unchanged to the superclass
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: creates an instance configured with
+   * {@link NutchConfiguration#create()} and delegates to
+   * <code>main(http, args)</code>, which is not defined in this class
+   * (presumably the driver inherited from {@link HttpBase}).
+   *
+   * @param args
+   *          command-line arguments, passed through unchanged
+   * @throws Exception
+   *           whatever the delegated driver throws
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /**
+   * Fetches <code>url</code> by constructing a new {@link HttpResponse}.
+   *
+   * @param url
+   *          the URL to fetch
+   * @param datum
+   *          crawl datum handed to the response
+   * @param redirect
+   *          not used by this implementation
+   * @return the response obtained for <code>url</code>
+   * @throws ProtocolException
+   *           propagated from the {@link HttpResponse} constructor
+   * @throws IOException
+   *           propagated from the {@link HttpResponse} constructor
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
new file mode 100644
index 0000000..f6d7e4d
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -0,0 +1,558 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+
+/**
+ * An HTTP response.
+ */
+public class HttpResponse implements Response {
+
+  private Configuration conf;
+  private HttpBase http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
+
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
+
+  /**
+   * Fetches <code>url</code> over a newly opened socket and stores the status
+   * code, the headers and the (possibly decompressed) body in this object.
+   * <p>
+   * One HTTP/1.0 <code>GET</code> request is written, addressed to the proxy
+   * when <code>http.useProxy(url)</code> is true. For https URLs the plain
+   * socket is wrapped in an {@link SSLSocket} first. The socket is closed
+   * before the constructor returns, whether it succeeds or throws.
+   *
+   * @param http
+   *          protocol instance supplying timeout, proxy, TLS, user agent,
+   *          accept headers and content limit settings
+   * @param url
+   *          URL to fetch; its protocol must be <code>http</code> or
+   *          <code>https</code>
+   * @param datum
+   *          crawl datum whose modified time is sent as
+   *          <code>If-Modified-Since</code> when that feature is enabled and
+   *          the time is positive
+   * @throws ProtocolException
+   *           declared for protocol-level failures such as an unsupported URL
+   *           scheme or an unparsable response
+   * @throws IOException
+   *           on socket or stream errors
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    Scheme scheme = null;
+
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    // an empty file part is requested as the root path
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      // no explicit port: use the scheme default and leave it out of the
+      // Host header and the proxy request line
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      if (scheme == Scheme.HTTPS) {
+        // layer TLS over the already connected socket; the final "true"
+        // makes closing the SSL socket close the underlying one as well
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+            .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+            .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+            protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+            ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
+
+      this.conf = http.getConf();
+      // optionally record the address that was connected to (the proxy's
+      // address when a proxy is used)
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        // a proxy is sent the absolute URL instead of just the path
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        // the request is still sent, only without a User-Agent header
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+            .toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      // blank line terminating the request headers
+      reqStr.append("\r\n");
+
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false) == true) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+      // NOTE(review): getBytes() uses the platform default charset
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      // the pushback capacity lets the header parser return bytes that
+      // turned out to belong to the body
+      PushbackInputStream in = // process response
+          new PushbackInputStream(
+              new BufferedInputStream(socket.getInputStream(),
+                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false) == true) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
+      // skip over any interim "100 Continue" responses; the last status
+      // line and header block read are the ones that count
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
+        // parse headers
+        parseHeaders(in, line, httpHeaders);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+      if (transferEncoding != null && "chunked"
+          .equalsIgnoreCase(transferEncoding.trim())) {
+        readChunkedContent(in, line);
+      } else {
+        readPlainContent(in);
+      }
+
+      // undo a compressed content encoding, if one was declared
+      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+        content = http.processGzipEncoded(content, url);
+      } else if ("deflate".equals(contentEncoding)) {
+        content = http.processDeflateEncoded(content, url);
+      } else {
+        // store the headers verbatim only if the response was not compressed
+        // as the content length reported will not match otherwise
+        if (httpHeaders != null) {
+          headers.add("_response.headers_", httpHeaders.toString());
+        }
+        if (Http.LOG.isTraceEnabled()) {
+          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+        }
+      }
+
+    } finally {
+      // always release the connection, also when an exception is on its way
+      if (socket != null)
+        socket.close();
+    }
+
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  /**
+   * @return the URL this response was fetched from
+   */
+  @Override
+  public URL getUrl() {
+    return url;
+  }
+
+  /**
+   * @return the numeric status code parsed from the status line
+   */
+  @Override
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * Looks up a single header in the response metadata.
+   *
+   * @param name
+   *          header name
+   * @return whatever the header metadata returns for <code>name</code>
+   */
+  @Override
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /**
+   * @return the metadata object holding all headers (the live instance, not
+   *         a copy)
+   */
+  @Override
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /**
+   * @return the response body (the internal array, not a copy)
+   */
+  @Override
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * ------------------------- * </implementation:Response> *
+   * -------------------------
+   */
+
+  private void readPlainContent(InputStream in)
+      throws HttpException, IOException {
+
+    int contentLength = Integer.MAX_VALUE; // get content length
+    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+    if (contentLengthString != null) {
+      contentLengthString = contentLengthString.trim();
+      try {
+        if (!contentLengthString.isEmpty())
+          contentLength = Integer.parseInt(contentLengthString);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad content length: " + contentLengthString);
+      }
+    }
+    if (http.getMaxContent() >= 0 && contentLength > http
+        .getMaxContent()) // limit
+      // download
+      // size
+      contentLength = http.getMaxContent();
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    int length = 0;
+
+    // do not try to read if the contentLength is 0
+    if (contentLength == 0) {
+      content = new byte[0];
+      return;
+    }
+
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
+      out.write(bytes, 0, i);
+      length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
+    }
+    content = out.toByteArray();
+  }
+
+  /**
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+      // }
+
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+        // line.substring(pos+1)); }
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
+          .getMaxContent())
+        chunkLen = http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+            (chunkLen - chunkBytesRead) :
+            Http.BUFFER_SIZE;
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Will printed GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+        // len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent())
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line, null);
+
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException(
+          "bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line)
+      throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines from <code>in</code> until a blank line and adds each
+   * of them to {@link #headers}.
+   * <p>
+   * Some servers omit the blank line between headers and body. If a line
+   * contains <code>&lt;!DOCTYPE</code>, <code>&lt;HTML</code> or
+   * <code>&lt;html</code>, everything from that marker on is pushed back onto
+   * the stream as body data, the part before it is processed as the last
+   * header (errors in it are only logged) and parsing stops.
+   *
+   * @param in
+   *          stream positioned at the first header line
+   * @param line
+   *          scratch buffer used for reading lines
+   * @param httpHeaders
+   *          if not <code>null</code>, receives every line read, each
+   *          followed by a newline
+   * @throws IOException
+   *           if reading or pushing back fails
+   * @throws HttpException
+   *           if a header line is malformed
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line,
+      StringBuffer httpHeaders) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      if (httpHeaders != null)
+        httpHeaders.append(line).append("\n");
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
+          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
+          != -1)) {
+
+        // readLine() stores every byte as one char, so ISO-8859-1 maps the
+        // chars back to exactly the bytes that were read; any multi-byte
+        // encoding would alter bytes >= 0x80 and enlarge the data
+        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+        line.setLength(pos);
+
+        try {
+          // TODO: (CM) We don't know the header names here
+          // since we're just handling them generically. It would
+          // be nice to provide some sort of mapping function here
+          // for the returned header names to the standard metadata
+          // names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // best effort: the body has been recovered, a broken header
+          // fragment in front of it is not worth failing the fetch for
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line from <code>in</code> into <code>line</code>, without the
+   * line terminator. A line ends at "\r\n", a lone "\r" or a lone "\n".
+   *
+   * @param in
+   *          stream to read from
+   * @param line
+   *          buffer that is cleared and then filled with the line; each byte
+   *          read is appended as one char
+   * @param allowContinuedLine
+   *          if <code>true</code>, a non-empty line whose terminator is
+   *          followed by a blank or tab is continued with the following
+   *          input; that one blank or tab is dropped
+   * @return the length of the line read, 0 for an empty line
+   * @throws IOException
+   *           if reading fails; an {@link EOFException} if the stream ends
+   *           before a line terminator is seen
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        // swallow the '\n' of a "\r\n" pair
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // intentional fall-through: '\r' ends the line just like '\n'
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue; // resumes the enclosing for loop
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of <code>in</code> without consuming it.
+   *
+   * @param in
+   *          stream to look into
+   * @return the next byte as a value from 0 to 255, or -1 at end of stream
+   * @throws IOException
+   *           if reading or pushing back fails
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // only push back real data: unread(-1) would insert a byte 0xFF into the
+    // stream, which a later read would return as if it had been received
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
new file mode 100644
index 0000000..34d1d1c
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
new file mode 100644
index 0000000..a9afd78
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>Nutch-Test,*</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.description</name>
+  <value>Nutch protocol-httpclient test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth-test.xml</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+  <description></description>
+</property>
+
+</configuration>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
new file mode 100644
index 0000000..7dd9e9b
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.junit.After;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.nio.SelectChannelConnector;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+/**
+ * Test cases for protocol-http
+ */
+public class TestProtocolHttp {
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+
+  private Http http;
+  private Server server;
+  private Context root;
+  private Configuration conf;
+  private int port;
+
+  public void setUp(boolean redirection) throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+
+    server = new Server();
+
+    if (redirection) {
+      root = new Context(server, "/redirection", Context.SESSIONS);
+      root.setAttribute("newContextURL", "/redirect");
+    } else {
+      root = new Context(server, "/", Context.SESSIONS);
+    }
+
+    ServletHolder sh = new ServletHolder(
+        org.apache.jasper.servlet.JspServlet.class);
+    root.addServlet(sh, "*.jsp");
+    root.setResourceBase(RES_DIR);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+  }
+
+  @Test
+  public void testStatusCode() throws Exception {
+    startServer(47504, false);
+    fetchPage("/basic-http.jsp", 200);
+    fetchPage("/redirect301.jsp", 301);
+    fetchPage("/redirect302.jsp", 302);
+    fetchPage("/nonexists.html", 404);
+    fetchPage("/brokenpage.jsp", 500);
+  }
+
+  @Test
+  public void testRedirectionJetty() throws Exception {
+    // Redirection via Jetty
+    startServer(47503, true);
+    fetchPage("/redirection", 302);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port and redirection parameter.
+   * 
+   * @param portno
+   *          Port number.
+   * @param redirection
+   *          whether to mount the servlet context at /redirection
+   */
+  private void startServer(int portno, boolean redirection) throws Exception {
+    port = portno;
+    setUp(redirection);
+    SelectChannelConnector connector = new SelectChannelConnector();
+    connector.setHost("127.0.0.1");
+    connector.setPort(port);
+
+    server.addConnector(connector);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code. Also use jsp pages for redirection.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    Content content = out.getContent();
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+
+    if (page.compareTo("/nonexists.html") != 0
+        && page.compareTo("/brokenpage.jsp") != 0
+        && page.compareTo("/redirection") != 0) {
+      assertEquals("ContentType " + url, "text/html",
+          content.getContentType());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/build.xml b/nutch-plugins/protocol-httpclient/build.xml
new file mode 100644
index 0000000..b66eb97
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/build.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-httpclient" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <target name="deps-test">
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/ivy.xml b/nutch-plugins/protocol-httpclient/ivy.xml
new file mode 100644
index 0000000..00b6f07
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/basic.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/basic.jsp b/nutch-plugins/protocol-httpclient/jsp/basic.jsp
new file mode 100644
index 0000000..c5bfb89
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/basic.jsp
@@ -0,0 +1,74 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP demonstrates basic authentication. When this JSP page is
+  requested with no query parameters, then the user must enter the
+  username as 'userx' and password as 'passx' when prompted for
+  authentication. Apart from this there are a few other test cases,
+  which can be used by passing a test case number as query parameter in
+  the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
+  The credentials for each test case can be easily figured out from the
+  code below.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "sun.misc.BASE64Decoder"
+%><%
+  String authHeader = request.getHeader("Authorization");
+  String realm = null;
+  String username = null;
+  String password = null;
+  int testCase = 0;
+  try {
+    testCase = Integer.parseInt(request.getParameter("case"));
+  } catch (Exception ex) {
+    // do nothing
+  }
+  switch (testCase) {
+    case 1:
+      realm = "realm1"; username = "user1"; password = "pass1";
+      break;
+
+    case 2:
+      realm = "realm2"; username = "user2"; password = "pass2";
+      break;
+
+    default:
+      realm = "realmx"; username = "userx"; password = "passx";
+      break;
+  }
+
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) {
+    String creds[] = new String(new BASE64Decoder().decodeBuffer(
+        authHeader.substring(6))).split(":", 2);
+    if (creds[0].equals(username) && creds[1].equals(password))
+          authenticated = true;
+  }
+  if (!authenticated) {
+    response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Basic Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/cookies.jsp b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
new file mode 100644
index 0000000..ae2ace2
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
@@ -0,0 +1,63 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests whether the client can remember cookies. When the JSP
+  is fetched for the first time without any query parameters, it sets
+  a few cookies in the client. On a second request, with the query
+  parameter, 'cookie=yes', it checks whether the client has sent all
+  the cookies. If the cookies are found, HTTP 200 response is returned.
+  If the cookies are not found, HTTP 403 response is returned.
+
+  Author: Susam Pal
+--%><%
+  String cookieParam = request.getParameter("cookie");
+  if (!"yes".equals(cookieParam)) { // Send cookies
+    response.addCookie(new Cookie("var1", "val1"));
+    response.addCookie(new Cookie("var2", "val2"));
+%>
+<html>
+<head><title>Cookies Set</title></head>
+<body><p>Cookies have been set.</p></body>
+</html>
+<%
+  } else { // Check cookies
+    int cookiesCount = 0;
+
+    Cookie[] cookies = request.getCookies();
+    if (cookies != null) {
+      for (int i = 0; i < cookies.length; i++) {
+        if (cookies[i].getName().equals("var1")
+            && cookies[i].getValue().equals("val1"))
+          cookiesCount++;
+
+        if (cookies[i].getName().equals("var2")
+            && cookies[i].getValue().equals("val2"))
+          cookiesCount++;
+      }
+    }
+
+    if (cookiesCount != 2) {
+      response.sendError(response.SC_FORBIDDEN);
+    } else {
+%>
+<html>
+<head><title>Cookies Found</title></head>
+<body><p>Cookies found!</p></body>
+</html>
+<%
+    }
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/digest.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/digest.jsp b/nutch-plugins/protocol-httpclient/jsp/digest.jsp
new file mode 100644
index 0000000..c657484
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/digest.jsp
@@ -0,0 +1,68 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests digest authentication. It generates an HTTP response
+  with a WWW-Authenticate challenge for digest authentication and checks
+  the user-name supplied by the client. It does not check the other
+  parameters and hashes, as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials needs to
+  be tested.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "java.util.StringTokenizer"
+    import = "java.util.HashMap"
+%><%
+  String username = "digest_user";
+  String authHeader = request.getHeader("Authorization");
+  
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) {
+    HashMap map = new HashMap();
+    StringTokenizer tokenizer = new StringTokenizer(
+        authHeader.substring(7).trim(), ",");
+    while (tokenizer.hasMoreTokens()) {
+      String[] param = tokenizer.nextToken().trim().split("=", 2);
+      if (param[1].charAt(0) == '"') {
+        param[1] = param[1].substring(1, param[1].length() - 1);
+      }
+      map.put(param[0], param[1]);
+    }
+
+    if (username.equals((String)map.get("username")))
+      authenticated = true;
+  }
+
+  if (!authenticated) {
+    String realm = "realm=\"realm1\"";
+    String qop   = "qop=\"auth,auth-int\"";
+    String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
+    String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
+
+    response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
+        + qop + ", " + nonce + ", " + opaque);
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Digest Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/noauth.jsp b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
new file mode 100644
index 0000000..c726b0f
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
@@ -0,0 +1,36 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests whether the client is sending any pre-emptive
+  authentication headers. The client is expected not to send pre-emptive
+  authentication headers. If such authentication headers are found, this
+  JSP will return an HTTP 401 response; HTTP 200 response otherwise.
+
+  Author: Susam Pal
+--%><%
+  if (request.getHeader("Authorization") != null) {
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>No authorization headers found</title></head>
+<body>
+<p>No authorization headers found.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
new file mode 100644
index 0000000..6ad921e
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
@@ -0,0 +1,89 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests NTLM authentication. It generates an HTTP response
+  with a WWW-Authenticate challenge for NTLM authentication and checks
+  the user-name and domain supplied by the client. It does not check the
+  other parameters and hashes, as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials needs to
+  be tested.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "sun.misc.BASE64Decoder"
+    import = "sun.misc.BASE64Encoder"
+%><%
+  String authHeader = request.getHeader("Authorization");
+  String username = null;
+  String domain = null;
+  String host = null;
+
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.startsWith("NTLM")) {
+    byte[] msg = new BASE64Decoder().decodeBuffer(
+        authHeader.substring(5));
+    if (msg[8] == 1) {
+      byte[] type2msg = {
+          'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
+          2, 0, 0, 0,                           // Type 2 Indicator
+          10, 0, 10, 0, 32, 0, 0, 0,            // length, offset
+          0x00, 0x02, (byte) 0x81, 0,           // Flags
+          1, 2, 3, 4, 5, 6, 7, 8,               // Challenge
+          'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
+      };
+      response.setHeader("WWW-Authenticate", "NTLM "
+          + new BASE64Encoder().encodeBuffer(type2msg));
+      response.sendError(response.SC_UNAUTHORIZED);
+      return;
+    } else if (msg[8] == 3) {
+      int length;
+      int offset;
+
+      // Get domain name
+      length = msg[30] + msg[31] * 256;
+      offset = msg[32] + msg[33] * 256;
+      domain = new String(msg, offset, length);
+
+      // Get user name
+      length = msg[38] + msg[39] * 256;
+      offset = msg[40] + msg[41] * 256;
+      username = new String(msg, offset, length);
+
+      // Get host (workstation) name
+      length = msg[46] + msg[47] * 256;
+      offset = msg[48] + msg[49] * 256;
+      host = new String(msg, offset, length);
+
+      if ("ntlm_user".equalsIgnoreCase(username)
+          && "NUTCH".equalsIgnoreCase(domain))
+        authenticated = true;
+    }
+  }
+
+  if (!authenticated) {
+    response.setHeader("WWW-Authenticate", "NTLM");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>NTLM Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, You have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/plugin.xml b/nutch-plugins/protocol-httpclient/plugin.xml
new file mode 100644
index 0000000..1747713
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/plugin.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+   
+   http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<plugin
+   id="protocol-httpclient"
+   name="Http / Https Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+   
+   <runtime>
+      <library name="protocol-httpclient.jar">
+         <export name="*"/>
+      </library>
+      <library name="jsoup-1.8.1.jar"/>
+   </runtime>
+   
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+   
+   <extension id="org.apache.nutch.protocol.httpclient"
+      name="HttpProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="http"/>
+      </implementation>
+      
+   </extension>
+   
+   <extension id="org.apache.nutch.protocol.https"
+      name="HttpsProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="https"/>
+      </implementation>
+      
+   </extension>
+   
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/pom.xml b/nutch-plugins/protocol-httpclient/pom.xml
new file mode 100644
index 0000000..2f2fc7c
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/pom.xml
@@ -0,0 +1,62 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-httpclient</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-httpclient</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.8.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.26</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jsp-2.1</artifactId>
+            <version>6.1.14</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>