You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:48 UTC

[04/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
deleted file mode 100644
index f6d7e4d..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ /dev/null
@@ -1,558 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.net.ssl.SSLSocket;
-import javax.net.ssl.SSLSocketFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.http.api.HttpException;
-
-/**
- * An HTTP response.
- */
-public class HttpResponse implements Response {
-
-  private Configuration conf;
-  private HttpBase http;
-  private URL url;
-  private String orig;
-  private String base;
-  private byte[] content;
-  private int code;
-  private Metadata headers = new SpellCheckedMetadata();
-  // used for storing the http headers verbatim
-  private StringBuffer httpHeaders;
-
-  protected enum Scheme {
-    HTTP, HTTPS,
-  }
-
-  /**
-   * Fetches {@code url} with an HTTP/1.0 GET over a plain or TLS socket and
-   * stores the parsed status code, headers and decoded body on this instance.
-   *
-   * @param http
-   *          plugin instance supplying timeout, proxy, TLS, agent and size
-   *          settings
-   * @param url
-   *          the http or https URL to fetch
-   * @param datum
-   *          crawl record; its modified time feeds If-Modified-Since when
-   *          that feature is enabled
-   * @throws ProtocolException
-   *           an HttpException is thrown here for a scheme other than
-   *           http/https; the parsing helpers throw HttpException as well
-   * @throws IOException
-   *           on socket or stream errors while connecting, writing or reading
-   */
-  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
-      throws ProtocolException, IOException {
-
-    this.http = http;
-    this.url = url;
-    this.orig = url.toString();
-    this.base = url.toString();
-
-    Scheme scheme = null;
-
-    if ("http".equals(url.getProtocol())) {
-      scheme = Scheme.HTTP;
-    } else if ("https".equals(url.getProtocol())) {
-      scheme = Scheme.HTTPS;
-    } else {
-      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
-    }
-
-    if (Http.LOG.isTraceEnabled()) {
-      Http.LOG.trace("fetching " + url);
-    }
-
-    String path = "".equals(url.getFile()) ? "/" : url.getFile();
-
-    // some servers will redirect a request with a host line like
-    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they don't
-    // want the :80, so portString stays empty when the URL has no explicit port
-
-    String host = url.getHost();
-    int port;
-    String portString;
-    if (url.getPort() == -1) {
-      if (scheme == Scheme.HTTP) {
-        port = 80;
-      } else {
-        port = 443;
-      }
-      portString = "";
-    } else {
-      port = url.getPort();
-      portString = ":" + port;
-    }
-    Socket socket = null;
-
-    try {
-      socket = new Socket(); // create the socket
-      socket.setSoTimeout(http.getTimeout());
-
-      // connect to the proxy when http.useProxy(url) says so, else to the target host
-      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
-      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
-      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
-      socket.connect(sockAddr, http.getTimeout());
-
-      if (scheme == Scheme.HTTPS) { // NOTE(review): sockHost/sockPort are the proxy's when a proxy is used - confirm TLS is meant to start there
-        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
-            .getDefault();
-        SSLSocket sslsocket = (SSLSocket) factory
-            .createSocket(socket, sockHost, sockPort, true);
-        sslsocket.setUseClientMode(true);
-
-        // Get the protocols and ciphers supported by this JVM
-        Set<String> protocols = new HashSet<String>(
-            Arrays.asList(sslsocket.getSupportedProtocols()));
-        Set<String> ciphers = new HashSet<String>(
-            Arrays.asList(sslsocket.getSupportedCipherSuites()));
-
-        // Intersect with preferred protocols and ciphers
-        protocols.retainAll(http.getTlsPreferredProtocols());
-        ciphers.retainAll(http.getTlsPreferredCipherSuites());
-
-        sslsocket.setEnabledProtocols(
-            protocols.toArray(new String[protocols.size()]));
-        sslsocket.setEnabledCipherSuites(
-            ciphers.toArray(new String[ciphers.size()]));
-
-        sslsocket.startHandshake();
-        socket = sslsocket;
-      }
-
-      this.conf = http.getConf();
-      if (sockAddr != null
-          && conf.getBoolean("store.ip.address", false) == true) {
-        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
-      }
-
-      // build the request: absolute URI when going through a proxy, else just the path
-      OutputStream req = socket.getOutputStream();
-
-      StringBuffer reqStr = new StringBuffer("GET ");
-      if (http.useProxy(url)) {
-        reqStr.append(url.getProtocol() + "://" + host + portString + path);
-      } else {
-        reqStr.append(path);
-      }
-
-      reqStr.append(" HTTP/1.0\r\n");
-
-      reqStr.append("Host: ");
-      reqStr.append(host);
-      reqStr.append(portString);
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
-
-      String userAgent = http.getUserAgent();
-      if ((userAgent == null) || (userAgent.length() == 0)) {
-        if (Http.LOG.isErrorEnabled()) {
-          Http.LOG.error("User-agent is not set!");
-        }
-      } else {
-        reqStr.append("User-Agent: ");
-        reqStr.append(userAgent);
-        reqStr.append("\r\n");
-      }
-
-      reqStr.append("Accept-Language: ");
-      reqStr.append(this.http.getAcceptLanguage());
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept: ");
-      reqStr.append(this.http.getAccept());
-      reqStr.append("\r\n");
-
-      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat
-            .toString(datum.getModifiedTime()));
-        reqStr.append("\r\n");
-      }
-      reqStr.append("\r\n");
-
-      // store the request in the metadata?
-      if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
-      }
-
-      byte[] reqBytes = reqStr.toString().getBytes(); // NOTE(review): encodes with the platform default charset
-
-      req.write(reqBytes);
-      req.flush();
-
-      PushbackInputStream in = // process response
-          new PushbackInputStream(
-              new BufferedInputStream(socket.getInputStream(),
-                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
-
-      StringBuffer line = new StringBuffer();
-
-      // store the http headers verbatim
-      if (conf.getBoolean("store.http.headers", false) == true) {
-        httpHeaders = new StringBuffer();
-      }
-
-      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
-
-      boolean haveSeenNonContinueStatus = false;
-      while (!haveSeenNonContinueStatus) {
-        // parse status code line
-        this.code = parseStatusLine(in, line);
-        if (httpHeaders != null)
-          httpHeaders.append(line).append("\n");
-        // parse headers
-        parseHeaders(in, line, httpHeaders);
-        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
-      }
-
-      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
-      if (transferEncoding != null && "chunked"
-          .equalsIgnoreCase(transferEncoding.trim())) {
-        readChunkedContent(in, line);
-      } else {
-        readPlainContent(in);
-      }
-
-      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
-      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
-        content = http.processGzipEncoded(content, url);
-      } else if ("deflate".equals(contentEncoding)) {
-        content = http.processDeflateEncoded(content, url);
-      } else {
-        // store the headers verbatim only if the response was not compressed
-        // as the content length reported will not match otherwise
-        if (httpHeaders != null) {
-          headers.add("_response.headers_", httpHeaders.toString());
-        }
-        if (Http.LOG.isTraceEnabled()) {
-          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
-        }
-      }
-
-    } finally {
-      if (socket != null)
-        socket.close();
-    }
-
-  }
-
-  /*
-   * ------------------------- * <implementation:Response> *
-   * -------------------------
-   */
-
-  public URL getUrl() { // the URL that was passed to the constructor
-    return url;
-  }
-
-  public int getCode() { // status code parsed from the last status line read
-    return code;
-  }
-
-  public String getHeader(String name) { // delegates the lookup to the headers Metadata
-    return headers.get(name);
-  }
-
-  public Metadata getHeaders() { // the internal Metadata object itself, not a copy
-    return headers;
-  }
-
-  public byte[] getContent() { // the internal body array itself, not a copy
-    return content;
-  }
-
-  /*
-   * ------------------------- * </implementation:Response> *
-   * -------------------------
-   */
-
-  private void readPlainContent(InputStream in)
-      throws HttpException, IOException { // reads a non-chunked body from in into content
-
-    int contentLength = Integer.MAX_VALUE; // effectively unlimited unless Content-Length says otherwise
-    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-    if (contentLengthString != null) {
-      contentLengthString = contentLengthString.trim();
-      try {
-        if (!contentLengthString.isEmpty())
-          contentLength = Integer.parseInt(contentLengthString);
-      } catch (NumberFormatException e) { // also reached when the value exceeds the int range
-        throw new HttpException("bad content length: " + contentLengthString);
-      }
-    }
-    if (http.getMaxContent() >= 0 && contentLength > http
-        .getMaxContent()) // a non-negative max content setting
-      // caps how many bytes are read, so longer
-      // bodies are truncated
-      contentLength = http.getMaxContent();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-    byte[] bytes = new byte[Http.BUFFER_SIZE];
-    int length = 0;
-
-    // do not try to read if the contentLength is 0
-    if (contentLength == 0) {
-      content = new byte[0];
-      return;
-    }
-
-    // read until EOF or until contentLength bytes have been collected
-    int i = in.read(bytes); // NOTE(review): this first read is not bounded by contentLength, so content can exceed it by up to one buffer
-    while (i != -1) {
-      out.write(bytes, 0, i);
-      length += i;
-      if (length >= contentLength) {
-        break;
-      }
-      if ((length + Http.BUFFER_SIZE) > contentLength) {
-        // reading next chunk may hit contentLength,
-        // must limit number of bytes read
-        i = in.read(bytes, 0, (contentLength - length));
-      } else {
-        i = in.read(bytes);
-      }
-    }
-    content = out.toByteArray();
-  }
-
-  /**
-   * Reads a body sent with "chunked" transfer encoding into {@code content}:
-   * each iteration reads a hexadecimal chunk-size line, then that many bytes,
-   * until a zero-size chunk is seen; trailing header lines are parsed last.
-   *
-   * @param in
-   *          response stream positioned at the first chunk-size line
-   * @param line
-   *          scratch buffer, overwritten by every line that is read
-   * @throws HttpException
-   *           on an unparsable chunk size or end of stream inside a chunk
-   * @throws IOException
-   *           on read failure; readLine throws EOFException when the stream
-   *           ends before a line terminator
-   */
-  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
-      throws HttpException, IOException {
-    boolean doneChunks = false;
-    int contentBytesRead = 0; // NOTE(review): never incremented below, so the max-content check is applied per chunk, not to the total
-    byte[] bytes = new byte[Http.BUFFER_SIZE];
-    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-
-    while (!doneChunks) {
-      if (Http.LOG.isTraceEnabled()) {
-        Http.LOG.trace("Http: starting chunk");
-      }
-
-      readLine(in, line, false);
-
-      String chunkLenStr;
-      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
-      // }
-
-      int pos = line.indexOf(";");
-      if (pos < 0) {
-        chunkLenStr = line.toString();
-      } else {
-        chunkLenStr = line.substring(0, pos);
-        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
-        // line.substring(pos+1)); }
-      }
-      chunkLenStr = chunkLenStr.trim();
-      int chunkLen;
-      try {
-        chunkLen = Integer.parseInt(chunkLenStr, 16);
-      } catch (NumberFormatException e) {
-        throw new HttpException("bad chunk length: " + line.toString());
-      }
-
-      if (chunkLen == 0) {
-        doneChunks = true;
-        break;
-      }
-
-      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
-          .getMaxContent())
-        chunkLen = http.getMaxContent() - contentBytesRead;
-
-      // read one chunk
-      int chunkBytesRead = 0;
-      while (chunkBytesRead < chunkLen) {
-
-        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
-            (chunkLen - chunkBytesRead) :
-            Http.BUFFER_SIZE;
-        int len = in.read(bytes, 0, toRead);
-
-        if (len == -1)
-          throw new HttpException("chunk eof after " + contentBytesRead
-              + " bytes in successful chunks" + " and " + chunkBytesRead
-              + " in current chunk");
-
-        // DANGER!!! Enabling this would print GZIPed stuff right to your
-        // terminal!
-        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
-        // len)); }
-
-        out.write(bytes, 0, len);
-        chunkBytesRead += len;
-      }
-
-      readLine(in, line, false); // presumably consumes the line terminator that follows the chunk data
-
-    }
-
-    if (!doneChunks) { // NOTE(review): unreachable - the loop above only exits normally once doneChunks is true
-      if (contentBytesRead != http.getMaxContent())
-        throw new HttpException("chunk eof: !doneChunk && didn't max out");
-      return;
-    }
-
-    content = out.toByteArray();
-    parseHeaders(in, line, null); // trailer lines after the last chunk go into headers
-
-  }
-
-  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
-      throws IOException, HttpException { // reads one line and returns the integer that follows its first space
-    readLine(in, line, false);
-
-    int codeStart = line.indexOf(" "); // -1 when the line has no space; the whole line is then parsed below
-    int codeEnd = line.indexOf(" ", codeStart + 1);
-
-    // handle lines with no plaintext result code, ie:
-    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
-    if (codeEnd == -1)
-      codeEnd = line.length();
-
-    int code;
-    try {
-      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
-    } catch (NumberFormatException e) {
-      throw new HttpException(
-          "bad status line '" + line + "': " + e.getMessage(), e);
-    }
-
-    return code;
-  }
-
-  private void processHeaderLine(StringBuffer line)
-      throws IOException, HttpException { // splits "key: value" and stores the pair with headers.set
-
-    int colonIndex = line.indexOf(":"); // key is up to colon
-    if (colonIndex == -1) { // no colon: ignore an all-whitespace line, reject anything else
-      int i;
-      for (i = 0; i < line.length(); i++)
-        if (!Character.isWhitespace(line.charAt(i)))
-          break;
-      if (i == line.length())
-        return;
-      throw new HttpException("No colon in header:" + line);
-    }
-    String key = line.substring(0, colonIndex);
-
-    int valueStart = colonIndex + 1; // skip spaces and tabs after the colon
-    while (valueStart < line.length()) {
-      int c = line.charAt(valueStart);
-      if (c != ' ' && c != '\t')
-        break;
-      valueStart++;
-    }
-    String value = line.substring(valueStart);
-    headers.set(key, value); // NOTE(review): presumably replaces earlier values of a repeated header - confirm against Metadata.set
-  }
-
-  // Reads header lines up to the blank line into headers; raw lines are also appended to httpHeaders when it is non-null
-  private void parseHeaders(PushbackInputStream in, StringBuffer line,
-      StringBuffer httpHeaders) throws IOException, HttpException {
-
-    while (readLine(in, line, true) != 0) {
-
-      if (httpHeaders != null)
-        httpHeaders.append(line).append("\n");
-
-      // handle HTTP responses with missing blank line after headers:
-      int pos;
-      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
-          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
-          != -1)) {
-
-        in.unread(line.substring(pos).getBytes("UTF-8")); // markup goes back on the stream as body; NOTE(review): chars were built byte-by-byte in readLine, so bytes >= 0x80 grow when re-encoded as UTF-8
-        line.setLength(pos);
-
-        try {
-          // TODO: (CM) We don't know the header names here
-          // since we're just handling them generically. It would
-          // be nice to provide some sort of mapping function here
-          // for the returned header names to the standard metadata
-          // names in the ParseData class
-          processHeaderLine(line);
-        } catch (Exception e) {
-          // fixme: a malformed partial header line is only logged, not propagated
-          Http.LOG.warn("Error: ", e);
-        }
-        return;
-      }
-
-      processHeaderLine(line);
-    }
-  }
-
-  private static int readLine(PushbackInputStream in, StringBuffer line,
-      boolean allowContinuedLine) throws IOException { // reads one CR, LF or CRLF terminated line into line and returns its length
-    line.setLength(0);
-    for (int c = in.read(); c != -1; c = in.read()) {
-      switch (c) {
-      case '\r':
-        if (peek(in) == '\n') {
-          in.read();
-        }
-      case '\n': // the '\r' case deliberately falls through to here
-        if (line.length() > 0) {
-          // at EOL -- check for continued line if the current
-          // (possibly continued) line wasn't blank
-          if (allowContinuedLine)
-            switch (peek(in)) {
-            case ' ':
-            case '\t': // line is continued; the leading space/tab itself is dropped
-              in.read();
-              continue;
-            }
-        }
-        return line.length(); // else complete
-      default:
-        line.append((char) c);
-      }
-    }
-    throw new EOFException(); // stream ended before a line terminator; any partial line is not returned
-  }
-
-  private static int peek(PushbackInputStream in) throws IOException { // returns the next byte and pushes it back
-    int value = in.read();
-    in.unread(value); // NOTE(review): at end of stream value is -1, which unread() stores as byte 0xFF - confirm this is intended
-    return value;
-  }
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
deleted file mode 100644
index 34d1d1c..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
deleted file mode 100644
index a9afd78..0000000
--- a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<configuration>
-
-<property>
-  <name>http.robots.agents</name>
-  <value>Nutch-Test,*</value>
-  <description></description>
-</property>
-
-<property>
-  <name>http.agent.name</name>
-  <value>Nutch-Test</value>
-  <description></description>
-</property>
-
-<property>
-  <name>http.agent.description</name>
-  <value>Nutch protocol-httpclient test</value>
-  <description></description>
-</property>
-
-<property>
-  <name>http.auth.file</name>
-  <value>httpclient-auth-test.xml</value>
-  <description></description>
-</property>
-
-<property>
-  <name>http.timeout</name>
-  <value>60000</value>
-  <description></description>
-</property>
-
-</configuration>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
deleted file mode 100644
index 7dd9e9b..0000000
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.http;
-
-import static org.junit.Assert.assertEquals;
-
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.junit.After;
-import org.junit.Test;
-import org.mortbay.jetty.Server;
-import org.mortbay.jetty.nio.SelectChannelConnector;
-import org.mortbay.jetty.servlet.Context;
-import org.mortbay.jetty.servlet.ServletHolder;
-
-/**
- * Test cases for protocol-http
- */
-public class TestProtocolHttp {
-  private static final String RES_DIR = System.getProperty("test.data", ".");
-
-  private Http http;
-  private Server server;
-  private Context root;
-  private Configuration conf;
-  private int port;
-
-  public void setUp(boolean redirection) throws Exception { // builds the Http plugin and an unstarted Jetty server; called from startServer
-    conf = new Configuration();
-    conf.addResource("nutch-default.xml");
-    conf.addResource("nutch-site-test.xml");
-
-    http = new Http();
-    http.setConf(conf);
-
-    server = new Server();
-
-    if (redirection) { // mount the context at /redirection instead of /
-      root = new Context(server, "/redirection", Context.SESSIONS);
-      root.setAttribute("newContextURL", "/redirect");
-    } else {
-      root = new Context(server, "/", Context.SESSIONS);
-    }
-
-    ServletHolder sh = new ServletHolder(
-        org.apache.jasper.servlet.JspServlet.class);
-    root.addServlet(sh, "*.jsp");
-    root.setResourceBase(RES_DIR); // RES_DIR comes from the test.data system property, defaulting to "."
-  }
-
-  @After
-  public void tearDown() throws Exception { // stops the Jetty server created in setUp
-    server.stop();
-  }
-
-  @Test
-  public void testStatusCode() throws Exception { // checks the status code returned for each fixture page
-    startServer(47504, false);
-    fetchPage("/basic-http.jsp", 200);
-    fetchPage("/redirect301.jsp", 301);
-    fetchPage("/redirect302.jsp", 302);
-    fetchPage("/nonexists.html", 404);
-    fetchPage("/brokenpage.jsp", 500);
-  }
-
-  @Test
-  public void testRedirectionJetty() throws Exception {
-    // Redirection via Jetty: presumably the bare context path is redirected by Jetty itself, hence the expected 302
-    startServer(47503, true);
-    fetchPage("/redirection", 302);
-  }
-
-  /**
-   * Runs setUp(redirection), then binds the Jetty server to 127.0.0.1 on the
-   * given port and starts it.
-   *
-   * @param portno
-   *          Port number to listen on; also stored in the port field.
-   * @param redirection
-   *          passed to setUp: true mounts the context at /redirection
-   */
-  private void startServer(int portno, boolean redirection) throws Exception {
-    port = portno;
-    setUp(redirection);
-    SelectChannelConnector connector = new SelectChannelConnector();
-    connector.setHost("127.0.0.1");
-    connector.setPort(port);
-
-    server.addConnector(connector);
-    server.start();
-  }
-
-  /**
-   * Fetches the specified <code>page</code> from the local Jetty server and
-   * checks whether the HTTP response status code matches with the expected
-   * code. For pages other than /nonexists.html, /brokenpage.jsp and
-   * /redirection the content type must also be text/html.
-   *
-   * @param page
-   *          Page to be fetched.
-   * @param expectedCode
-   *          HTTP response status code expected while fetching the page.
-   */
-  private void fetchPage(String page, int expectedCode) throws Exception {
-    URL url = new URL("http", "127.0.0.1", port, page);
-    CrawlDatum crawlDatum = new CrawlDatum();
-    Response response = http.getResponse(url, crawlDatum, true); // the status code is taken from this response
-    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
-        crawlDatum); // the content type is taken from this output; presumably a second request for the same URL
-    Content content = out.getContent();
-    assertEquals("HTTP Status Code for " + url, expectedCode,
-        response.getCode());
-
-    if (page.compareTo("/nonexists.html") != 0
-        && page.compareTo("/brokenpage.jsp") != 0
-        && page.compareTo("/redirection") != 0) {
-      assertEquals("ContentType " + url, "text/html",
-          content.getContentType());
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/build.xml b/src/plugin/protocol-httpclient/build.xml
deleted file mode 100644
index b66eb97..0000000
--- a/src/plugin/protocol-httpclient/build.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-httpclient" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-http"/>
-  </target>
-
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-http/*.jar" />
-    </fileset>
-    <pathelement location="${build.dir}/test/conf"/>
-  </path>
-
-  <target name="deps-test">
-    <copy toDir="${build.test}">
-      <fileset dir="${src.test}" excludes="**/*.java"/>
-    </copy>
-  </target>
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data" />
-  <copy todir="${build.test}/data">
-      <fileset dir="jsp"/>
-   </copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml
deleted file mode 100644
index 00b6f07..0000000
--- a/src/plugin/protocol-httpclient/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/basic.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/basic.jsp b/src/plugin/protocol-httpclient/jsp/basic.jsp
deleted file mode 100644
index c5bfb89..0000000
--- a/src/plugin/protocol-httpclient/jsp/basic.jsp
+++ /dev/null
@@ -1,74 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  This JSP demonstrates basic authentication. When this JSP page is
-  requested with no query parameters, then the user must enter the
-  username as 'userx' and password as 'passx' when prompted for
-  authentication. Apart from this there are a few other test cases,
-  which can be used by passing a test case number as query parameter in
-  the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
-  The credentials for each test case can be easily figured out from the
-  code below.
-
-  Author: Susam Pal
---%><%@ page
-    import = "sun.misc.BASE64Decoder"
-%><%
-  String authHeader = request.getHeader("Authorization");
-  String realm = null;
-  String username = null;
-  String password = null;
-  int testCase = 0;
-  try {
-    testCase = Integer.parseInt(request.getParameter("case"));
-  } catch (Exception ex) {
-    // missing or non-numeric "case" parameter: testCase stays 0 (default credentials)
-  }
-  switch (testCase) {
-    case 1:
-      realm = "realm1"; username = "user1"; password = "pass1";
-      break;
-
-    case 2:
-      realm = "realm2"; username = "user2"; password = "pass2";
-      break;
-
-    default:
-      realm = "realmx"; username = "userx"; password = "passx";
-      break;
-  }
-
-  boolean authenticated = false;
-  if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) {
-    String creds[] = new String(new BASE64Decoder().decodeBuffer(
-        authHeader.substring(6))).split(":", 2); // skips the first 6 characters ("Basic " plus the space) before decoding
-    if (creds[0].equals(username) && creds[1].equals(password)) // NOTE(review): creds[1] does not exist when the decoded value has no ':'
-          authenticated = true;
-  }
-  if (!authenticated) {
-    response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
-    response.sendError(response.SC_UNAUTHORIZED);
-  } else {
-%>
-<html>
-<head><title>Basic Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
-  }
-%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/cookies.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/cookies.jsp b/src/plugin/protocol-httpclient/jsp/cookies.jsp
deleted file mode 100644
index ae2ace2..0000000
--- a/src/plugin/protocol-httpclient/jsp/cookies.jsp
+++ /dev/null
@@ -1,63 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  This JSP tests whether the client can remember cookies. When the JSP
-  is fetched for the first time without any query parameters, it sets
-  a few cookies in the client. On a second request, with the query
-  parameter, 'cookie=yes', it checks whether the client has sent all
-  the cookies. If the cookies are found, HTTP 200 response is returned.
-  If the cookies are not found, HTTP 403 response is returned.
-
-  Author: Susam Pal
---%><%
-  String cookieParam = request.getParameter("cookie");
-  if (!"yes".equals(cookieParam)) { // Send cookies
-    response.addCookie(new Cookie("var1", "val1"));
-    response.addCookie(new Cookie("var2", "val2"));
-%>
-<html>
-<head><title>Cookies Set</title></head>
-<body><p>Cookies have been set.</p></body>
-</html>
-<%
-  } else { // Check cookies
-    int cookiesCount = 0;
-
-    Cookie[] cookies = request.getCookies();
-    if (cookies != null) {
-      for (int i = 0; i < cookies.length; i++) {
-        if (cookies[i].getName().equals("var1")
-            && cookies[i].getValue().equals("val1"))
-          cookiesCount++;
-
-        if (cookies[i].getName().equals("var2")
-            && cookies[i].getValue().equals("val2"))
-          cookiesCount++;
-      }
-    }
-
-    if (cookiesCount != 2) {
-      response.sendError(response.SC_FORBIDDEN);
-    } else {
-%>
-<html>
-<head><title>Cookies Found</title></head>
-<body><p>Cookies found!</p></body>
-</html>
-<%
-    }
-  }
-%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/digest.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/digest.jsp b/src/plugin/protocol-httpclient/jsp/digest.jsp
deleted file mode 100644
index c657484..0000000
--- a/src/plugin/protocol-httpclient/jsp/digest.jsp
+++ /dev/null
@@ -1,68 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  This JSP tests digest authentication. It generates an HTTP response
-  with a WWW-Authenticate header for digest authentication and checks the
-  user-name supplied by the client. It does not check the other
-  parameters and hashes, as controlled JUnit tests would be performed
-  against this and only the proper submission of credentials needs to
-  be tested.
-
-  Author: Susam Pal
---%><%@ page
-    import = "java.util.StringTokenizer"
-    import = "java.util.HashMap"
-%><%
-  String username = "digest_user";
-  String authHeader = request.getHeader("Authorization");
-  
-  boolean authenticated = false;
-  if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) {
-    HashMap map = new HashMap();
-    StringTokenizer tokenizer = new StringTokenizer(
-        authHeader.substring(7).trim(), ",");
-    while (tokenizer.hasMoreTokens()) {
-      String[] param = tokenizer.nextToken().trim().split("=", 2);
-      if (param[1].charAt(0) == '"') {
-        param[1] = param[1].substring(1, param[1].length() - 1);
-      }
-      map.put(param[0], param[1]);
-    }
-
-    if (username.equals((String)map.get("username")))
-      authenticated = true;
-  }
-
-  if (!authenticated) {
-    String realm = "realm=\"realm1\"";
-    String qop   = "qop=\"auth,auth-int\"";
-    String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
-    String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
-
-    response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
-        + qop + ", " + nonce + ", " + opaque);
-    response.sendError(response.SC_UNAUTHORIZED);
-  } else {
-%>
-<html>
-<head><title>Digest Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
-  }
-%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/noauth.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/noauth.jsp b/src/plugin/protocol-httpclient/jsp/noauth.jsp
deleted file mode 100644
index c726b0f..0000000
--- a/src/plugin/protocol-httpclient/jsp/noauth.jsp
+++ /dev/null
@@ -1,36 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  This JSP tests whether the client is sending any pre-emptive
-  authentication headers. The client is expected not to send pre-emptive
-  authentication headers. If such authentication headers are found, this
-  JSP will return an HTTP 401 response; HTTP 200 response otherwise.
-
-  Author: Susam Pal
---%><%
-  if (request.getHeader("Authorization") != null) {
-    response.sendError(response.SC_UNAUTHORIZED);
-  } else {
-%>
-<html>
-<head><title>No authorization headers found</title></head>
-<body>
-<p>No authorization headers found.</p>
-</body>
-</html>
-<%
-  }
-%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/ntlm.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/ntlm.jsp b/src/plugin/protocol-httpclient/jsp/ntlm.jsp
deleted file mode 100644
index 6ad921e..0000000
--- a/src/plugin/protocol-httpclient/jsp/ntlm.jsp
+++ /dev/null
@@ -1,89 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  This JSP tests NTLM authentication. It generates an HTTP response
-  with a WWW-Authenticate header for NTLM authentication and checks the
-  user-name and domain supplied by the client. It does not check the other
-  parameters and hashes, as controlled JUnit tests would be performed
-  against this and only the proper submission of credentials needs to
-  be tested.
-
-  Author: Susam Pal
---%><%@ page
-    import = "sun.misc.BASE64Decoder"
-    import = "sun.misc.BASE64Encoder"
-%><%
-  String authHeader = request.getHeader("Authorization");
-  String username = null;
-  String domain = null;
-  String host = null;
-
-  boolean authenticated = false;
-  if (authHeader != null && authHeader.startsWith("NTLM")) {
-    byte[] msg = new BASE64Decoder().decodeBuffer(
-        authHeader.substring(5));
-    if (msg[8] == 1) {
-      byte[] type2msg = {
-          'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
-          2, 0, 0, 0,                           // Type 2 Indicator
-          10, 0, 10, 0, 32, 0, 0, 0,            // length, offset
-          0x00, 0x02, (byte) 0x81, 0,           // Flags
-          1, 2, 3, 4, 5, 6, 7, 8,               // Challenge
-          'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
-      };
-      response.setHeader("WWW-Authenticate", "NTLM "
-          + new BASE64Encoder().encodeBuffer(type2msg));
-      response.sendError(response.SC_UNAUTHORIZED);
-      return;
-    } else if (msg[8] == 3) {
-      int length;
-      int offset;
-
-      // Get domain name
-      length = msg[30] + msg[31] * 256;
-      offset = msg[32] + msg[33] * 256;
-      domain = new String(msg, offset, length);
-
-      // Get user name
-      length = msg[38] + msg[39] * 256;
-      offset = msg[40] + msg[41] * 256;
-      username = new String(msg, offset, length);
-
-      // Get host (workstation) name
-      length = msg[46] + msg[47] * 256;
-      offset = msg[48] + msg[49] * 256;
-      host = new String(msg, offset, length);
-
-      if ("ntlm_user".equalsIgnoreCase(username)
-          && "NUTCH".equalsIgnoreCase(domain))
-        authenticated = true;
-    }
-  }
-
-  if (!authenticated) {
-    response.setHeader("WWW-Authenticate", "NTLM");
-    response.sendError(response.SC_UNAUTHORIZED);
-  } else {
-%>
-<html>
-<head>NTLM Authentication Test</head>
-<body>
-<p>Hi <%= username %>, You have been successfully authenticated.</p>
-</body>
-</html>
-<%
-  }
-%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/plugin.xml b/src/plugin/protocol-httpclient/plugin.xml
deleted file mode 100644
index 1747713..0000000
--- a/src/plugin/protocol-httpclient/plugin.xml
+++ /dev/null
@@ -1,58 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-   
-   http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-<plugin
-   id="protocol-httpclient"
-   name="Http / Https Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-   
-   <runtime>
-      <library name="protocol-httpclient.jar">
-         <export name="*"/>
-      </library>
-      <library name="jsoup-1.8.1.jar"/>
-   </runtime>
-   
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-http"/>
-   </requires>
-   
-   <extension id="org.apache.nutch.protocol.httpclient"
-      name="HttpProtocol"
-      point="org.apache.nutch.protocol.Protocol">
-      
-      <implementation id="org.apache.nutch.protocol.httpclient.Http"
-         class="org.apache.nutch.protocol.httpclient.Http">
-         <parameter name="protocolName" value="http"/>
-      </implementation>
-      
-   </extension>
-   
-   <extension id="org.apache.nutch.protocol.https"
-      name="HttpsProtocol"
-      point="org.apache.nutch.protocol.Protocol">
-      
-      <implementation id="org.apache.nutch.protocol.httpclient.Http"
-         class="org.apache.nutch.protocol.httpclient.Http">
-         <parameter name="protocolName" value="https"/>
-      </implementation>
-      
-   </extension>
-   
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
deleted file mode 100644
index afcf24a..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * Based on EasySSLProtocolSocketFactory from commons-httpclient:
- * 
- * $Header:
- * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
- * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
- * -0800 (Sat, 26 Feb 2005) $
- */
-
-package org.apache.nutch.protocol.httpclient;
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.Socket;
-import java.net.UnknownHostException;
-
-import org.apache.commons.httpclient.ConnectTimeoutException;
-import org.apache.commons.httpclient.HttpClientError;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
-import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-
-public class DummySSLProtocolSocketFactory implements
-    SecureProtocolSocketFactory {
-
-  /** Logger object for this class. */
-  private static final Logger LOG = LoggerFactory
-      .getLogger(DummySSLProtocolSocketFactory.class);
-
-  private SSLContext sslcontext = null;
-
-  /**
-   * Constructor for DummySSLProtocolSocketFactory.
-   */
-  public DummySSLProtocolSocketFactory() {
-    super();
-  }
-
-  private static SSLContext createEasySSLContext() {
-    try {
-      SSLContext context = SSLContext.getInstance("SSL");
-      context.init(null,
-          new TrustManager[] { new DummyX509TrustManager(null) }, null);
-      return context;
-    } catch (Exception e) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error(e.getMessage(), e);
-      }
-      throw new HttpClientError(e.toString());
-    }
-  }
-
-  private SSLContext getSSLContext() {
-    if (this.sslcontext == null) {
-      this.sslcontext = createEasySSLContext();
-    }
-    return this.sslcontext;
-  }
-
-  /**
-   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
-   */
-  public Socket createSocket(String host, int port, InetAddress clientHost,
-      int clientPort) throws IOException, UnknownHostException {
-
-    return getSSLContext().getSocketFactory().createSocket(host, port,
-        clientHost, clientPort);
-  }
-
-  /**
-   * Attempts to get a new socket connection to the given host within the given
-   * time limit.
-   * <p>
-   * To circumvent the limitations of older JREs that do not support connect
-   * timeout a controller thread is executed. The controller thread attempts to
-   * create a new socket within the given limit of time. If socket constructor
-   * does not return until the timeout expires, the controller terminates and
-   * throws a {@link ConnectTimeoutException}
-   * </p>
-   * 
-   * @param host
-   *          the host name/IP
-   * @param port
-   *          the port on the host
-   * @param localAddress
-   *          the local host name/IP to bind the socket to
-   * @param localPort
-   *          the port on the local machine
-   * @param params
-   *          {@link HttpConnectionParams Http connection parameters}
-   * 
-   * @return Socket a new socket
-   * 
-   * @throws IOException
-   *           if an I/O error occurs while creating the socket
-   * @throws UnknownHostException
-   *           if the IP address of the host cannot be determined
-   */
-  public Socket createSocket(final String host, final int port,
-      final InetAddress localAddress, final int localPort,
-      final HttpConnectionParams params) throws IOException,
-      UnknownHostException, ConnectTimeoutException {
-    if (params == null) {
-      throw new IllegalArgumentException("Parameters may not be null");
-    }
-    int timeout = params.getConnectionTimeout();
-    if (timeout == 0) {
-      return createSocket(host, port, localAddress, localPort);
-    } else {
-      // To be eventually deprecated when migrated to Java 1.4 or above
-      return ControllerThreadSocketFactory.createSocket(this, host, port,
-          localAddress, localPort, timeout);
-    }
-  }
-
-  /**
-   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
-   */
-  public Socket createSocket(String host, int port) throws IOException,
-      UnknownHostException {
-    return getSSLContext().getSocketFactory().createSocket(host, port);
-  }
-
-  /**
-   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
-   */
-  public Socket createSocket(Socket socket, String host, int port,
-      boolean autoClose) throws IOException, UnknownHostException {
-    return getSSLContext().getSocketFactory().createSocket(socket, host, port,
-        autoClose);
-  }
-
-  public boolean equals(Object obj) {
-    return ((obj != null) && obj.getClass().equals(
-        DummySSLProtocolSocketFactory.class));
-  }
-
-  public int hashCode() {
-    return DummySSLProtocolSocketFactory.class.hashCode();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
deleted file mode 100644
index b5509cc..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * Based on EasyX509TrustManager from commons-httpclient.
- */
-
-package org.apache.nutch.protocol.httpclient;
-
-import java.security.KeyStore;
-import java.security.KeyStoreException;
-import java.security.NoSuchAlgorithmException;
-import java.security.cert.CertificateException;
-import java.security.cert.X509Certificate;
-
-import javax.net.ssl.TrustManagerFactory;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DummyX509TrustManager implements X509TrustManager {
-  private X509TrustManager standardTrustManager = null;
-
-  /** Logger object for this class. */
-  private static final Logger LOG = LoggerFactory
-      .getLogger(DummyX509TrustManager.class);
-
-  /**
-   * Constructor for DummyX509TrustManager.
-   */
-  public DummyX509TrustManager(KeyStore keystore)
-      throws NoSuchAlgorithmException, KeyStoreException {
-    super();
-    String algo = TrustManagerFactory.getDefaultAlgorithm();
-    TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
-    factory.init(keystore);
-    TrustManager[] trustmanagers = factory.getTrustManagers();
-    if (trustmanagers.length == 0) {
-      throw new NoSuchAlgorithmException(algo + " trust manager not supported");
-    }
-    this.standardTrustManager = (X509TrustManager) trustmanagers[0];
-  }
-
-  /**
-   * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
-   *      String)
-   */
-  public boolean isClientTrusted(X509Certificate[] certificates) {
-    return true;
-  }
-
-  /**
-   * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
-   *      String)
-   */
-  public boolean isServerTrusted(X509Certificate[] certificates) {
-    return true;
-  }
-
-  /**
-   * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
-   */
-  public X509Certificate[] getAcceptedIssuers() {
-    return this.standardTrustManager.getAcceptedIssuers();
-  }
-
-  public void checkClientTrusted(X509Certificate[] arg0, String arg1)
-      throws CertificateException {
-    // do nothing
-
-  }
-
-  public void checkServerTrusted(X509Certificate[] arg0, String arg1)
-      throws CertificateException {
-    // do nothing
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
deleted file mode 100644
index 75506ce..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-// JDK imports
-import java.io.InputStream;
-import java.io.IOException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import org.xml.sax.SAXException;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.Node;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// HTTP Client imports
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
-import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
-import org.apache.commons.httpclient.protocol.Protocol;
-import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
-// NUTCH-1929 Consider implementing dependency injection for crawl HTTPS sites that use self signed certificates
-//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
-
-import org.apache.commons.lang.StringUtils;
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * <p>
- * This class is a protocol plugin that configures an HTTP client for Basic,
- * Digest and NTLM authentication schemes for web server as well as proxy
- * server. It takes care of HTTPS protocol as well as cookies in a single fetch
- * session.
- * </p>
- * <p>
- * Documentation can be found on the Nutch <a
- * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
- * >HttpAuthenticationSchemes</a> wiki page.
- * </p>
- * <p>
- * The original description of the motivation to support <a
- * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
- * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
- * HttpPostAuthentication development is documented at the <a
- * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
- * issue.
- * 
- * @author Susam Pal
- */
-public class Http extends HttpBase {
-
-  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
-  private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
-
-  // The Configuration has not been set yet at class-initialization time,
-  // so this client starts out unconfigured; setConf() configures it.
-  private static HttpClient client = new HttpClient(connectionManager);
-  private static String defaultUsername;
-  private static String defaultPassword;
-  private static String defaultRealm;
-  private static String defaultScheme;
-  private static String authFile;
-  private static String agentHost;
-  private static boolean authRulesRead = false;
-  private static Configuration conf;
-
-  private int maxThreadsTotal = 10;
-
-  private String proxyUsername;
-  private String proxyPassword;
-  private String proxyRealm;
-
-  private static HttpFormAuthConfigurer formConfigurer;
-
-  /**
-   * Returns the configured HTTP client.
-   * 
-   * @return HTTP client
-   */
-  static synchronized HttpClient getClient() {
-    return client;
-  }
-
-  /**
-   * Constructs this plugin.
-   */
-  public Http() {
-    super(LOG);
-  }
-
-  /**
-   * Reads the configuration from the Nutch configuration files and sets the
-   * configuration.
-   * 
-   * @param conf
-   *          Configuration
-   */
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-    this.conf = conf;
-    this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
-    this.proxyUsername = conf.get("http.proxy.username", "");
-    this.proxyPassword = conf.get("http.proxy.password", "");
-    this.proxyRealm = conf.get("http.proxy.realm", "");
-    agentHost = conf.get("http.agent.host", "");
-    authFile = conf.get("http.auth.file", "");
-    configureClient();
-    try {
-      setCredentials();
-    } catch (Exception ex) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error("Could not read " + authFile + " : " + ex.getMessage());
-      }
-    }
-  }
-
-  /**
-   * Main method.
-   * 
-   * @param args
-   *          Command line arguments
-   */
-  public static void main(String[] args) throws Exception {
-    Http http = new Http();
-    http.setConf(NutchConfiguration.create());
-    main(http, args);
-  }
-
-  /**
-   * Fetches the <code>url</code> with a configured HTTP client and gets the
-   * response.
-   * 
-   * @param url
-   *          URL to be fetched
-   * @param datum
-   *          Crawl data
-   * @param redirect
-   *          Follow redirects if and only if true
-   * @return HTTP response
-   */
-  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
-      throws ProtocolException, IOException {
-    resolveCredentials(url);
-    return new HttpResponse(this, url, datum, redirect);
-  }
-
-  /**
-   * Configures the HTTP client
-   */
-  private void configureClient() {
-
-    // Set up an HTTPS socket factory that accepts self-signed certs.
-    // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
-    ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
-    Protocol https = new Protocol("https", factory, 443);
-    Protocol.registerProtocol("https", https);
-
-    HttpConnectionManagerParams params = connectionManager.getParams();
-    params.setConnectionTimeout(timeout);
-    params.setSoTimeout(timeout);
-    params.setSendBufferSize(BUFFER_SIZE);
-    params.setReceiveBufferSize(BUFFER_SIZE);
-
-    // --------------------------------------------------------------------------------
-    // NUTCH-1836: Modification to increase the number of available connections
-    // for multi-threaded crawls.
-    // --------------------------------------------------------------------------------
-    params.setMaxTotalConnections(conf.getInt(
-        "mapred.tasktracker.map.tasks.maximum", 5)
-        * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
-
-    // Also set max connections per host to maxThreadsTotal since all threads
-    // might be used to fetch from the same host - otherwise timeout errors can
-    // occur
-    params.setDefaultMaxConnectionsPerHost(conf.getInt(
-        "fetcher.threads.fetch", maxThreadsTotal));
-
-    // executeMethod(HttpMethod) seems to ignore the connection timeout on the
-    // connection manager.
-    // set it explicitly on the HttpClient.
-    client.getParams().setConnectionManagerTimeout(timeout);
-
-    HostConfiguration hostConf = client.getHostConfiguration();
-    ArrayList<Header> headers = new ArrayList<Header>();
-    // Set the User Agent in the header
-    // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
-    // prefer English
-    headers.add(new Header("Accept-Language", acceptLanguage));
-    // prefer UTF-8
-    headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
-    // prefer understandable formats
-    headers
-        .add(new Header(
-            "Accept",
-            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
-    // accept gzipped content
-    headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
-    hostConf.getParams().setParameter("http.default-headers", headers);
-
-    // HTTP proxy server details
-    if (useProxy) {
-      hostConf.setProxy(proxyHost, proxyPort);
-
-      if (proxyUsername.length() > 0) {
-
-        AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
-            this.proxyRealm);
-
-        NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
-            this.proxyPassword, Http.agentHost, this.proxyRealm);
-
-        client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
-      }
-    }
-
-  }
-
-  /**
-   * Reads authentication configuration file (defined as 'http.auth.file' in
-   * Nutch configuration file) and sets the credentials for the configured
-   * authentication scopes in the HTTP client object.
-   * 
-   * @throws ParserConfigurationException
-   *           If a document builder can not be created.
-   * @throws SAXException
-   *           If any parsing error occurs.
-   * @throws IOException
-   *           If any I/O error occurs.
-   */
-  private static synchronized void setCredentials()
-      throws ParserConfigurationException, SAXException, IOException {
-
-    if (authRulesRead)
-      return;
-
-    authRulesRead = true; // Avoid re-attempting to read
-
-    InputStream is = conf.getConfResourceAsInputStream(authFile);
-    if (is != null) {
-      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
-          .parse(is);
-
-      Element rootElement = doc.getDocumentElement();
-      if (!"auth-configuration".equals(rootElement.getTagName())) {
-        if (LOG.isWarnEnabled())
-          LOG.warn("Bad auth conf file: root element <"
-              + rootElement.getTagName() + "> found in " + authFile
-              + " - must be <auth-configuration>");
-      }
-
-      // For each set of credentials
-      NodeList credList = rootElement.getChildNodes();
-      for (int i = 0; i < credList.getLength(); i++) {
-        Node credNode = credList.item(i);
-        if (!(credNode instanceof Element))
-          continue;
-
-        Element credElement = (Element) credNode;
-        if (!"credentials".equals(credElement.getTagName())) {
-          if (LOG.isWarnEnabled())
-            LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
-                + "> not recognized in " + authFile
-                + " - expected <credentials>");
-          continue;
-        }
-
-        String authMethod = credElement.getAttribute("authMethod");
-        // read http form post auth info
-        if (StringUtils.isNotBlank(authMethod)) {
-          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
-          continue;
-        }
-
-        String username = credElement.getAttribute("username");
-        String password = credElement.getAttribute("password");
-
-        // For each authentication scope
-        NodeList scopeList = credElement.getChildNodes();
-        for (int j = 0; j < scopeList.getLength(); j++) {
-          Node scopeNode = scopeList.item(j);
-          if (!(scopeNode instanceof Element))
-            continue;
-
-          Element scopeElement = (Element) scopeNode;
-
-          if ("default".equals(scopeElement.getTagName())) {
-
-            // Determine realm and scheme, if any
-            String realm = scopeElement.getAttribute("realm");
-            String scheme = scopeElement.getAttribute("scheme");
-
-            // Set default credentials
-            defaultUsername = username;
-            defaultPassword = password;
-            defaultRealm = realm;
-            defaultScheme = scheme;
-
-            if (LOG.isTraceEnabled()) {
-              LOG.trace("Credentials - username: " + username
-                  + "; set as default" + " for realm: " + realm + "; scheme: "
-                  + scheme);
-            }
-
-          } else if ("authscope".equals(scopeElement.getTagName())) {
-
-            // Determine authentication scope details
-            String host = scopeElement.getAttribute("host");
-            int port = -1; // For setting port to AuthScope.ANY_PORT
-            try {
-              port = Integer.parseInt(scopeElement.getAttribute("port"));
-            } catch (Exception ex) {
-              // do nothing, port is already set to any port
-            }
-            String realm = scopeElement.getAttribute("realm");
-            String scheme = scopeElement.getAttribute("scheme");
-
-            // Set credentials for the determined scope
-            AuthScope authScope = getAuthScope(host, port, realm, scheme);
-            NTCredentials credentials = new NTCredentials(username, password,
-                agentHost, realm);
-
-            client.getState().setCredentials(authScope, credentials);
-
-            if (LOG.isTraceEnabled()) {
-              LOG.trace("Credentials - username: " + username
-                  + "; set for AuthScope - " + "host: " + host + "; port: "
-                  + port + "; realm: " + realm + "; scheme: " + scheme);
-            }
-
-          } else {
-            if (LOG.isWarnEnabled())
-              LOG.warn("Bad auth conf file: Element <"
-                  + scopeElement.getTagName() + "> not recognized in "
-                  + authFile + " - expected <authscope>");
-          }
-        }
-        is.close();
-      }
-    }
-  }
-
-  /**
-   * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
-   * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field
-   * name="username" value="user1"/> </loginPostData> <additionalPostHeaders>
-   * <field name="header1" value="vaule1"/> </additionalPostHeaders>
-   * <removedFormFields> <field name="header1"/> </removedFormFields>
-   * </credentials> </auth-configuration>
-   */
-  private static HttpFormAuthConfigurer readFormAuthConfigurer(
-      Element credElement, String authMethod) {
-    if ("formAuth".equals(authMethod)) {
-      HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
-
-      String str = credElement.getAttribute("loginUrl");
-      if (StringUtils.isNotBlank(str)) {
-        formConfigurer.setLoginUrl(str.trim());
-      } else {
-        throw new IllegalArgumentException("Must set loginUrl.");
-      }
-      str = credElement.getAttribute("loginFormId");
-      if (StringUtils.isNotBlank(str)) {
-        formConfigurer.setLoginFormId(str.trim());
-      } else {
-        throw new IllegalArgumentException("Must set loginFormId.");
-      }
-      str = credElement.getAttribute("loginRedirect");
-      if (StringUtils.isNotBlank(str)) {
-        formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
-      }
-
-      NodeList nodeList = credElement.getChildNodes();
-      for (int j = 0; j < nodeList.getLength(); j++) {
-        Node node = nodeList.item(j);
-        if (!(node instanceof Element))
-          continue;
-
-        Element element = (Element) node;
-        if ("loginPostData".equals(element.getTagName())) {
-          Map<String, String> loginPostData = new HashMap<String, String>();
-          NodeList childNodes = element.getChildNodes();
-          for (int k = 0; k < childNodes.getLength(); k++) {
-            Node fieldNode = childNodes.item(k);
-            if (!(fieldNode instanceof Element))
-              continue;
-
-            Element fieldElement = (Element) fieldNode;
-            String name = fieldElement.getAttribute("name");
-            String value = fieldElement.getAttribute("value");
-            loginPostData.put(name, value);
-          }
-          formConfigurer.setLoginPostData(loginPostData);
-        } else if ("additionalPostHeaders".equals(element.getTagName())) {
-          Map<String, String> additionalPostHeaders = new HashMap<String, String>();
-          NodeList childNodes = element.getChildNodes();
-          for (int k = 0; k < childNodes.getLength(); k++) {
-            Node fieldNode = childNodes.item(k);
-            if (!(fieldNode instanceof Element))
-              continue;
-
-            Element fieldElement = (Element) fieldNode;
-            String name = fieldElement.getAttribute("name");
-            String value = fieldElement.getAttribute("value");
-            additionalPostHeaders.put(name, value);
-          }
-          formConfigurer.setAdditionalPostHeaders(additionalPostHeaders);
-        } else if ("removedFormFields".equals(element.getTagName())) {
-          Set<String> removedFormFields = new HashSet<String>();
-          NodeList childNodes = element.getChildNodes();
-          for (int k = 0; k < childNodes.getLength(); k++) {
-            Node fieldNode = childNodes.item(k);
-            if (!(fieldNode instanceof Element))
-              continue;
-
-            Element fieldElement = (Element) fieldNode;
-            String name = fieldElement.getAttribute("name");
-            removedFormFields.add(name);
-          }
-          formConfigurer.setRemovedFormFields(removedFormFields);
-        }
-      }
-
-      return formConfigurer;
-    } else {
-      throw new IllegalArgumentException("Unsupported authMethod: "
-          + authMethod);
-    }
-  }
-
-  /**
-   * If credentials for the authentication scope determined from the specified
-   * <code>url</code> is not already set in the HTTP client, then this method
-   * sets the default credentials to fetch the specified <code>url</code>. If
-   * credentials are found for the authentication scope, the method returns
-   * without altering the client.
-   * 
-   * @param url
-   *          URL to be fetched
-   */
-  private void resolveCredentials(URL url) {
-
-    if (formConfigurer != null) {
-      HttpFormAuthentication formAuther = new HttpFormAuthentication(
-          formConfigurer, client, this);
-      try {
-        formAuther.login();
-      } catch (Exception e) {
-        throw new RuntimeException(e);
-      }
-
-      return;
-    }
-
-    if (defaultUsername != null && defaultUsername.length() > 0) {
-
-      int port = url.getPort();
-      if (port == -1) {
-        if ("https".equals(url.getProtocol()))
-          port = 443;
-        else
-          port = 80;
-      }
-
-      AuthScope scope = new AuthScope(url.getHost(), port);
-
-      if (client.getState().getCredentials(scope) != null) {
-        if (LOG.isTraceEnabled())
-          LOG.trace("Pre-configured credentials with scope - host: "
-              + url.getHost() + "; port: " + port + "; found for url: " + url);
-
-        // Credentials are already configured, so do nothing and return
-        return;
-      }
-
-      if (LOG.isTraceEnabled())
-        LOG.trace("Pre-configured credentials with scope -  host: "
-            + url.getHost() + "; port: " + port + "; not found for url: " + url);
-
-      AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
-          defaultRealm, defaultScheme);
-
-      NTCredentials serverCredentials = new NTCredentials(defaultUsername,
-          defaultPassword, agentHost, defaultRealm);
-
-      client.getState().setCredentials(serverAuthScope, serverCredentials);
-    }
-  }
-
-  /**
-   * Returns an authentication scope for the specified <code>host</code>,
-   * <code>port</code>, <code>realm</code> and <code>scheme</code>.
-   * 
-   * @param host
-   *          Host name or address.
-   * @param port
-   *          Port number.
-   * @param realm
-   *          Authentication realm.
-   * @param scheme
-   *          Authentication scheme.
-   */
-  private static AuthScope getAuthScope(String host, int port, String realm,
-      String scheme) {
-
-    if (host.length() == 0)
-      host = null;
-
-    if (port < 0)
-      port = -1;
-
-    if (realm.length() == 0)
-      realm = null;
-
-    if (scheme.length() == 0)
-      scheme = null;
-
-    return new AuthScope(host, port, realm, scheme);
-  }
-
-  /**
-   * Returns an authentication scope for the specified <code>host</code>,
-   * <code>port</code> and <code>realm</code>.
-   * 
-   * @param host
-   *          Host name or address.
-   * @param port
-   *          Port number.
-   * @param realm
-   *          Authentication realm.
-   */
-  private static AuthScope getAuthScope(String host, int port, String realm) {
-
-    return getAuthScope(host, port, realm, "");
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
deleted file mode 100644
index 54dc905..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-import java.util.List;
-
-/**
- * The base level of services required for Http Authentication
- * 
- * @see HttpAuthenticationFactory
- * 
- * @author Matt Tencati
- */
-public interface HttpAuthentication {
-
-  /**
-   * Gets the credentials generated by the HttpAuthentication object. May return
-   * null.
-   * 
-   * @return The credentials value
-   */
-  public List<String> getCredentials();
-
-  /**
-   * Gets the realm used by the HttpAuthentication object during creation.
-   * 
-   * @return The realm value
-   */
-  public String getRealm();
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
deleted file mode 100644
index daff5ec..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-/**
- * Can be used to identify problems during creation of Authentication objects.
- * In the future it may be used as a method of collecting authentication
- * failures during Http protocol transfer in order to present the user with
- * credentials required during a future fetch.
- * 
- * @author Matt Tencati
- */
-public class HttpAuthenticationException extends Exception {
-
-  /**
-   * Constructs a new exception with null as its detail message.
-   */
-  public HttpAuthenticationException() {
-    super();
-  }
-
-  /**
-   * Constructs a new exception with the specified detail message.
-   * 
-   * @param message
-   *          the detail message. The detail message is saved for later
-   *          retrieval by the {@link Throwable#getMessage()} method.
-   */
-  public HttpAuthenticationException(String message) {
-    super(message);
-  }
-
-  /**
-   * Constructs a new exception with the specified message and cause.
-   * 
-   * @param message
-   *          the detail message. The detail message is saved for later
-   *          retrieval by the {@link Throwable#getMessage()} method.
-   * @param cause
-   *          the cause (use {@link #getCause()} to retrieve the cause)
-   */
-  public HttpAuthenticationException(String message, Throwable cause) {
-    super(message, cause);
-  }
-
-  /**
-   * Constructs a new exception with the specified cause and detail message from
-   * given clause if it is not null.
-   * 
-   * @param cause
-   *          the cause (use {@link #getCause()} to retrieve the cause)
-   */
-  public HttpAuthenticationException(Throwable cause) {
-    super(cause);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
deleted file mode 100644
index 064a6d0..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-// JDK imports
-import java.util.ArrayList;
-import java.util.Collection;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-
-/**
- * Provides the Http protocol implementation with the ability to authenticate
- * when prompted. The goal is to provide multiple authentication types but for
- * now just the {@link HttpBasicAuthentication} authentication type is provided.
- * 
- * @see HttpBasicAuthentication
- * @see Http
- * @see HttpResponse
- * 
- * @author Matt Tencati
- */
-public class HttpAuthenticationFactory implements Configurable {
-
-  /**
-   * The HTTP Authentication (WWW-Authenticate) header which is returned by a
-   * webserver requiring authentication.
-   */
-  public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger(HttpAuthenticationFactory.class);
-
-  private Configuration conf = null;
-
-  public HttpAuthenticationFactory(Configuration conf) {
-    setConf(conf);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public HttpAuthentication findAuthentication(Metadata header) {
-
-    if (header == null)
-      return null;
-
-    try {
-      Collection<String> challenge = new ArrayList<String>();
-      challenge.add(header.get(WWW_AUTHENTICATE));
-
-      for (String challengeString : challenge) {
-        if (challengeString.equals("NTLM"))
-          challengeString = "Basic realm=techweb";
-
-        if (LOG.isTraceEnabled())
-          LOG.trace("Checking challengeString=" + challengeString);
-
-        HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(
-            challengeString, conf);
-        if (auth != null)
-          return auth;
-
-        // TODO Add additional Authentication lookups here
-      }
-    } catch (Exception e) {
-      LOG.error("Error: ", e);
-    }
-    return null;
-  }
-}