You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:48 UTC
[04/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
deleted file mode 100644
index f6d7e4d..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ /dev/null
@@ -1,558 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.net.ssl.SSLSocket;
-import javax.net.ssl.SSLSocketFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.http.api.HttpException;
-
-/**
- * An HTTP response.
- */
-public class HttpResponse implements Response {
-
- private Configuration conf;
- private HttpBase http;
- private URL url;
- private String orig;
- private String base;
- private byte[] content;
- private int code;
- private Metadata headers = new SpellCheckedMetadata();
- // used for storing the http headers verbatim
- private StringBuffer httpHeaders;
-
- protected enum Scheme {
- HTTP, HTTPS,
- }
-
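- /**
- * Fetches the given URL over a plain or TLS socket and parses the response.
- *
- * @param http plugin instance supplying timeout, proxy, TLS and agent settings
- * @param url the http or https URL to fetch
- * @param datum crawl datum whose modified time is sent as If-Modified-Since
- * @throws ProtocolException if the scheme is not http/https or the reply is malformed
- * @throws IOException on socket or stream errors
- */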
- public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
- throws ProtocolException, IOException {
-
- this.http = http;
- this.url = url;
- this.orig = url.toString();
- this.base = url.toString();
-
- Scheme scheme = null;
-
- if ("http".equals(url.getProtocol())) {
- scheme = Scheme.HTTP;
- } else if ("https".equals(url.getProtocol())) {
- scheme = Scheme.HTTPS;
- } else {
- throw new HttpException("Unknown scheme (not http/https) for url:" + url);
- }
-
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetching " + url);
- }
-
- String path = "".equals(url.getFile()) ? "/" : url.getFile();
-
- // some servers will redirect a request with a host line like
- // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
- // don't want the :80...
-
- String host = url.getHost();
- int port;
- String portString;
- if (url.getPort() == -1) {
- if (scheme == Scheme.HTTP) {
- port = 80;
- } else {
- port = 443;
- }
- portString = "";
- } else {
- port = url.getPort();
- portString = ":" + port;
- }
- Socket socket = null;
-
- try {
- socket = new Socket(); // create the socket
- socket.setSoTimeout(http.getTimeout());
-
- // connect
- String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
- int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
- InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
- socket.connect(sockAddr, http.getTimeout());
-
- if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
- sslsocket.setUseClientMode(true);
-
- // Get the protocols and ciphers supported by this JVM
- Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
- Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
-
- // Intersect with preferred protocols and ciphers
- protocols.retainAll(http.getTlsPreferredProtocols());
- ciphers.retainAll(http.getTlsPreferredCipherSuites());
-
- sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
- sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
-
- sslsocket.startHandshake();
- socket = sslsocket;
- }
-
- this.conf = http.getConf();
- if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
- headers.add("_ip_", sockAddr.getAddress().getHostAddress());
- }
-
- // make request
- OutputStream req = socket.getOutputStream();
-
- StringBuffer reqStr = new StringBuffer("GET ");
- if (http.useProxy(url)) {
- reqStr.append(url.getProtocol() + "://" + host + portString + path);
- } else {
- reqStr.append(path);
- }
-
- reqStr.append(" HTTP/1.0\r\n");
-
- reqStr.append("Host: ");
- reqStr.append(host);
- reqStr.append(portString);
- reqStr.append("\r\n");
-
- reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
-
- String userAgent = http.getUserAgent();
- if ((userAgent == null) || (userAgent.length() == 0)) {
- if (Http.LOG.isErrorEnabled()) {
- Http.LOG.error("User-agent is not set!");
- }
- } else {
- reqStr.append("User-Agent: ");
- reqStr.append(userAgent);
- reqStr.append("\r\n");
- }
-
- reqStr.append("Accept-Language: ");
- reqStr.append(this.http.getAcceptLanguage());
- reqStr.append("\r\n");
-
- reqStr.append("Accept: ");
- reqStr.append(this.http.getAccept());
- reqStr.append("\r\n");
-
- if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
- reqStr.append("\r\n");
- }
- reqStr.append("\r\n");
-
- // store the request in the metadata?
- if (conf.getBoolean("store.http.request", false) == true) {
- headers.add("_request_", reqStr.toString());
- }
-
- byte[] reqBytes = reqStr.toString().getBytes();
-
- req.write(reqBytes);
- req.flush();
-
- PushbackInputStream in = // process response
- new PushbackInputStream(
- new BufferedInputStream(socket.getInputStream(),
- Http.BUFFER_SIZE), Http.BUFFER_SIZE);
-
- StringBuffer line = new StringBuffer();
-
- // store the http headers verbatim
- if (conf.getBoolean("store.http.headers", false) == true) {
- httpHeaders = new StringBuffer();
- }
-
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
-
- boolean haveSeenNonContinueStatus = false;
- while (!haveSeenNonContinueStatus) {
- // parse status code line
- this.code = parseStatusLine(in, line);
- if (httpHeaders != null)
- httpHeaders.append(line).append("\n");
- // parse headers
- parseHeaders(in, line, httpHeaders);
- haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
- }
-
- String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
- if (transferEncoding != null && "chunked"
- .equalsIgnoreCase(transferEncoding.trim())) {
- readChunkedContent(in, line);
- } else {
- readPlainContent(in);
- }
-
- String contentEncoding = getHeader(Response.CONTENT_ENCODING);
- if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
- content = http.processGzipEncoded(content, url);
- } else if ("deflate".equals(contentEncoding)) {
- content = http.processDeflateEncoded(content, url);
- } else {
- // store the headers verbatim only if the response was not compressed
- // as the reported content length will not match otherwise
- if (httpHeaders != null) {
- headers.add("_response.headers_", httpHeaders.toString());
- }
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetched " + content.length + " bytes from " + url);
- }
- }
-
- } finally {
- if (socket != null)
- socket.close();
- }
-
- }
-
- /*
- * ------------------------- * <implementation:Response> *
- * -------------------------
- */
-
- public URL getUrl() {
- return url;
- }
-
- public int getCode() {
- return code;
- }
-
- public String getHeader(String name) {
- return headers.get(name);
- }
-
- public Metadata getHeaders() {
- return headers;
- }
-
- public byte[] getContent() {
- return content;
- }
-
- /*
- * ------------------------- * </implementation:Response> *
- * -------------------------
- */
-
- private void readPlainContent(InputStream in)
- throws HttpException, IOException {
-
- int contentLength = Integer.MAX_VALUE; // get content length
- String contentLengthString = headers.get(Response.CONTENT_LENGTH);
- if (contentLengthString != null) {
- contentLengthString = contentLengthString.trim();
- try {
- if (!contentLengthString.isEmpty())
- contentLength = Integer.parseInt(contentLengthString);
- } catch (NumberFormatException e) {
- throw new HttpException("bad content length: " + contentLengthString);
- }
- }
- if (http.getMaxContent() >= 0 && contentLength > http
- .getMaxContent()) // limit
- // download
- // size
- contentLength = http.getMaxContent();
-
- ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
- byte[] bytes = new byte[Http.BUFFER_SIZE];
- int length = 0;
-
- // do not try to read if the contentLength is 0
- if (contentLength == 0) {
- content = new byte[0];
- return;
- }
-
- // read content
- int i = in.read(bytes);
- while (i != -1) {
- out.write(bytes, 0, i);
- length += i;
- if (length >= contentLength) {
- break;
- }
- if ((length + Http.BUFFER_SIZE) > contentLength) {
- // reading next chunk may hit contentLength,
- // must limit number of bytes read
- i = in.read(bytes, 0, (contentLength - length));
- } else {
- i = in.read(bytes);
- }
- }
- content = out.toByteArray();
- }
-
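- /**
- * @param in stream positioned at the first chunk-size line of the body
- * @param line scratch buffer reused for reading chunk-size and trailer lines
- * @throws HttpException on an unparsable chunk length or EOF inside a chunk
- * @throws IOException on stream read errors
- */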
- private void readChunkedContent(PushbackInputStream in, StringBuffer line)
- throws HttpException, IOException {
- boolean doneChunks = false;
- int contentBytesRead = 0;
- byte[] bytes = new byte[Http.BUFFER_SIZE];
- ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-
- while (!doneChunks) {
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("Http: starting chunk");
- }
-
- readLine(in, line, false);
-
- String chunkLenStr;
- // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
- // }
-
- int pos = line.indexOf(";");
- if (pos < 0) {
- chunkLenStr = line.toString();
- } else {
- chunkLenStr = line.substring(0, pos);
- // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
- // line.substring(pos+1)); }
- }
- chunkLenStr = chunkLenStr.trim();
- int chunkLen;
- try {
- chunkLen = Integer.parseInt(chunkLenStr, 16);
- } catch (NumberFormatException e) {
- throw new HttpException("bad chunk length: " + line.toString());
- }
-
- if (chunkLen == 0) {
- doneChunks = true;
- break;
- }
-
- if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
- .getMaxContent())
- chunkLen = http.getMaxContent() - contentBytesRead;
-
- // read one chunk
- int chunkBytesRead = 0;
- while (chunkBytesRead < chunkLen) {
-
- int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
- (chunkLen - chunkBytesRead) :
- Http.BUFFER_SIZE;
- int len = in.read(bytes, 0, toRead);
-
- if (len == -1)
- throw new HttpException("chunk eof after " + contentBytesRead
- + " bytes in successful chunks" + " and " + chunkBytesRead
- + " in current chunk");
-
- // DANGER!!! Will print GZIPed stuff right to your
- // terminal!
- // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
- // len)); }
-
- out.write(bytes, 0, len);
- chunkBytesRead += len;
- }
-
- readLine(in, line, false);
-
- }
-
- if (!doneChunks) {
- if (contentBytesRead != http.getMaxContent())
- throw new HttpException("chunk eof: !doneChunk && didn't max out");
- return;
- }
-
- content = out.toByteArray();
- parseHeaders(in, line, null);
-
- }
-
- private int parseStatusLine(PushbackInputStream in, StringBuffer line)
- throws IOException, HttpException {
- readLine(in, line, false);
-
- int codeStart = line.indexOf(" ");
- int codeEnd = line.indexOf(" ", codeStart + 1);
-
- // handle lines with no plaintext result code, ie:
- // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
- if (codeEnd == -1)
- codeEnd = line.length();
-
- int code;
- try {
- code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
- } catch (NumberFormatException e) {
- throw new HttpException(
- "bad status line '" + line + "': " + e.getMessage(), e);
- }
-
- return code;
- }
-
- private void processHeaderLine(StringBuffer line)
- throws IOException, HttpException {
-
- int colonIndex = line.indexOf(":"); // key is up to colon
- if (colonIndex == -1) {
- int i;
- for (i = 0; i < line.length(); i++)
- if (!Character.isWhitespace(line.charAt(i)))
- break;
- if (i == line.length())
- return;
- throw new HttpException("No colon in header:" + line);
- }
- String key = line.substring(0, colonIndex);
-
- int valueStart = colonIndex + 1; // skip whitespace
- while (valueStart < line.length()) {
- int c = line.charAt(valueStart);
- if (c != ' ' && c != '\t')
- break;
- valueStart++;
- }
- String value = line.substring(valueStart);
- headers.set(key, value);
- }
-
- // Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line,
- StringBuffer httpHeaders) throws IOException, HttpException {
-
- while (readLine(in, line, true) != 0) {
-
- if (httpHeaders != null)
- httpHeaders.append(line).append("\n");
-
- // handle HTTP responses with missing blank line after headers
- int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
- (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
- != -1)) {
-
- in.unread(line.substring(pos).getBytes("UTF-8"));
- line.setLength(pos);
-
- try {
- // TODO: (CM) We don't know the header names here
- // since we're just handling them generically. It would
- // be nice to provide some sort of mapping function here
- // for the returned header names to the standard metadata
- // names in the ParseData class
- processHeaderLine(line);
- } catch (Exception e) {
- // fixme:
- Http.LOG.warn("Error: ", e);
- }
- return;
- }
-
- processHeaderLine(line);
- }
- }
-
- private static int readLine(PushbackInputStream in, StringBuffer line,
- boolean allowContinuedLine) throws IOException {
- line.setLength(0);
- for (int c = in.read(); c != -1; c = in.read()) {
- switch (c) {
- case '\r':
- if (peek(in) == '\n') {
- in.read();
- }
- case '\n':
- if (line.length() > 0) {
- // at EOL -- check for continued line if the current
- // (possibly continued) line wasn't blank
- if (allowContinuedLine)
- switch (peek(in)) {
- case ' ':
- case '\t': // line is continued
- in.read();
- continue;
- }
- }
- return line.length(); // else complete
- default:
- line.append((char) c);
- }
- }
- throw new EOFException();
- }
-
- private static int peek(PushbackInputStream in) throws IOException {
- int value = in.read();
- in.unread(value);
- return value;
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
deleted file mode 100644
index 34d1d1c..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
deleted file mode 100644
index a9afd78..0000000
--- a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<configuration>
-
-<property>
- <name>http.robots.agents</name>
- <value>Nutch-Test,*</value>
- <description></description>
-</property>
-
-<property>
- <name>http.agent.name</name>
- <value>Nutch-Test</value>
- <description></description>
-</property>
-
-<property>
- <name>http.agent.description</name>
- <value>Nutch protocol-httpclient test</value>
- <description></description>
-</property>
-
-<property>
- <name>http.auth.file</name>
- <value>httpclient-auth-test.xml</value>
- <description></description>
-</property>
-
-<property>
- <name>http.timeout</name>
- <value>60000</value>
- <description></description>
-</property>
-
-</configuration>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
deleted file mode 100644
index 7dd9e9b..0000000
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.http;
-
-import static org.junit.Assert.assertEquals;
-
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.junit.After;
-import org.junit.Test;
-import org.mortbay.jetty.Server;
-import org.mortbay.jetty.nio.SelectChannelConnector;
-import org.mortbay.jetty.servlet.Context;
-import org.mortbay.jetty.servlet.ServletHolder;
-
-/**
- * Test cases for protocol-http
- */
-public class TestProtocolHttp {
- private static final String RES_DIR = System.getProperty("test.data", ".");
-
- private Http http;
- private Server server;
- private Context root;
- private Configuration conf;
- private int port;
-
- public void setUp(boolean redirection) throws Exception {
- conf = new Configuration();
- conf.addResource("nutch-default.xml");
- conf.addResource("nutch-site-test.xml");
-
- http = new Http();
- http.setConf(conf);
-
- server = new Server();
-
- if (redirection) {
- root = new Context(server, "/redirection", Context.SESSIONS);
- root.setAttribute("newContextURL", "/redirect");
- } else {
- root = new Context(server, "/", Context.SESSIONS);
- }
-
- ServletHolder sh = new ServletHolder(
- org.apache.jasper.servlet.JspServlet.class);
- root.addServlet(sh, "*.jsp");
- root.setResourceBase(RES_DIR);
- }
-
- @After
- public void tearDown() throws Exception {
- server.stop();
- }
-
- @Test
- public void testStatusCode() throws Exception {
- startServer(47504, false);
- fetchPage("/basic-http.jsp", 200);
- fetchPage("/redirect301.jsp", 301);
- fetchPage("/redirect302.jsp", 302);
- fetchPage("/nonexists.html", 404);
- fetchPage("/brokenpage.jsp", 500);
- }
-
- @Test
- public void testRedirectionJetty() throws Exception {
- // Redirection via Jetty
- startServer(47503, true);
- fetchPage("/redirection", 302);
- }
-
- /**
- * Starts the Jetty server at a specified port and redirection parameter.
- *
- * @param portno
- * Port number.
- * @param redirection
- * whether to mount the test context at "/redirection" instead of "/"
- */
- private void startServer(int portno, boolean redirection) throws Exception {
- port = portno;
- setUp(redirection);
- SelectChannelConnector connector = new SelectChannelConnector();
- connector.setHost("127.0.0.1");
- connector.setPort(port);
-
- server.addConnector(connector);
- server.start();
- }
-
- /**
- * Fetches the specified <code>page</code> from the local Jetty server and
- * checks whether the HTTP response status code matches with the expected
- * code. Also use jsp pages for redirection.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- */
- private void fetchPage(String page, int expectedCode) throws Exception {
- URL url = new URL("http", "127.0.0.1", port, page);
- CrawlDatum crawlDatum = new CrawlDatum();
- Response response = http.getResponse(url, crawlDatum, true);
- ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
- crawlDatum);
- Content content = out.getContent();
- assertEquals("HTTP Status Code for " + url, expectedCode,
- response.getCode());
-
- if (page.compareTo("/nonexists.html") != 0
- && page.compareTo("/brokenpage.jsp") != 0
- && page.compareTo("/redirection") != 0) {
- assertEquals("ContentType " + url, "text/html",
- content.getContentType());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/build.xml b/src/plugin/protocol-httpclient/build.xml
deleted file mode 100644
index b66eb97..0000000
--- a/src/plugin/protocol-httpclient/build.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-httpclient" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-http"/>
- </target>
-
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-http/*.jar" />
- </fileset>
- <pathelement location="${build.dir}/test/conf"/>
- </path>
-
- <target name="deps-test">
- <copy toDir="${build.test}">
- <fileset dir="${src.test}" excludes="**/*.java"/>
- </copy>
- </target>
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml
deleted file mode 100644
index 00b6f07..0000000
--- a/src/plugin/protocol-httpclient/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/basic.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/basic.jsp b/src/plugin/protocol-httpclient/jsp/basic.jsp
deleted file mode 100644
index c5bfb89..0000000
--- a/src/plugin/protocol-httpclient/jsp/basic.jsp
+++ /dev/null
@@ -1,74 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP demonstrates basic authentication. When this JSP page is
- requested with no query parameters, then the user must enter the
- username as 'userx' and password as 'passx' when prompted for
- authentication. Apart from this there are a few other test cases,
- which can be used by passing a test case number as query parameter in
- the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
- The credentials for each test case can be easily figured out from the
- code below.
-
- Author: Susam Pal
---%><%@ page
- import = "sun.misc.BASE64Decoder"
-%><%
- String authHeader = request.getHeader("Authorization");
- String realm = null;
- String username = null;
- String password = null;
- int testCase = 0;
- try {
- testCase = Integer.parseInt(request.getParameter("case"));
- } catch (Exception ex) {
- // do nothing
- }
- switch (testCase) {
- case 1:
- realm = "realm1"; username = "user1"; password = "pass1";
- break;
-
- case 2:
- realm = "realm2"; username = "user2"; password = "pass2";
- break;
-
- default:
- realm = "realmx"; username = "userx"; password = "passx";
- break;
- }
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) {
- String creds[] = new String(new BASE64Decoder().decodeBuffer(
- authHeader.substring(6))).split(":", 2);
- if (creds[0].equals(username) && creds[1].equals(password))
- authenticated = true;
- }
- if (!authenticated) {
- response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>Basic Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/cookies.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/cookies.jsp b/src/plugin/protocol-httpclient/jsp/cookies.jsp
deleted file mode 100644
index ae2ace2..0000000
--- a/src/plugin/protocol-httpclient/jsp/cookies.jsp
+++ /dev/null
@@ -1,63 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests whether the client can remember cookies. When the JSP
- is fetched for the first time without any query parameters, it sets
- a few cookies in the client. On a second request, with the query
- parameter, 'cookie=yes', it checks whether the client has sent all
- the cookies. If the cookies are found, HTTP 200 response is returned.
- If the cookies are not found, HTTP 403 response is returned.
-
- Author: Susam Pal
---%><%
- String cookieParam = request.getParameter("cookie");
- if (!"yes".equals(cookieParam)) { // Send cookies
- response.addCookie(new Cookie("var1", "val1"));
- response.addCookie(new Cookie("var2", "val2"));
-%>
-<html>
-<head><title>Cookies Set</title></head>
-<body><p>Cookies have been set.</p></body>
-</html>
-<%
- } else { // Check cookies
- int cookiesCount = 0;
-
- Cookie[] cookies = request.getCookies();
- if (cookies != null) {
- for (int i = 0; i < cookies.length; i++) {
- if (cookies[i].getName().equals("var1")
- && cookies[i].getValue().equals("val1"))
- cookiesCount++;
-
- if (cookies[i].getName().equals("var2")
- && cookies[i].getValue().equals("val2"))
- cookiesCount++;
- }
- }
-
- if (cookiesCount != 2) {
- response.sendError(response.SC_FORBIDDEN);
- } else {
-%>
-<html>
-<head><title>Cookies Found</title></head>
-<body><p>Cookies found!</p></body>
-</html>
-<%
- }
- }
-%>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/digest.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/digest.jsp b/src/plugin/protocol-httpclient/jsp/digest.jsp
deleted file mode 100644
index c657484..0000000
--- a/src/plugin/protocol-httpclient/jsp/digest.jsp
+++ /dev/null
@@ -1,68 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests digest authentication. It generates an HTTP response
- with a WWW-Authenticate header for digest authentication and checks the
- user-name supplied by the client. It does not check the other
- parameters and hashes, as controlled JUnit tests would be performed
- against this and only the proper submission of credentials needs to
- be tested.
-
- Author: Susam Pal
---%><%@ page
- import = "java.util.StringTokenizer"
- import = "java.util.HashMap"
-%><%
- String username = "digest_user";
- String authHeader = request.getHeader("Authorization");
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) {
- HashMap map = new HashMap();
- StringTokenizer tokenizer = new StringTokenizer(
- authHeader.substring(7).trim(), ",");
- while (tokenizer.hasMoreTokens()) {
- String[] param = tokenizer.nextToken().trim().split("=", 2);
- if (param[1].charAt(0) == '"') {
- param[1] = param[1].substring(1, param[1].length() - 1);
- }
- map.put(param[0], param[1]);
- }
-
- if (username.equals((String)map.get("username")))
- authenticated = true;
- }
-
- if (!authenticated) {
- String realm = "realm=\"realm1\"";
- String qop = "qop=\"auth,auth-int\"";
- String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
- String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
-
- response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
- + qop + ", " + nonce + ", " + opaque);
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>Digest Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/noauth.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/noauth.jsp b/src/plugin/protocol-httpclient/jsp/noauth.jsp
deleted file mode 100644
index c726b0f..0000000
--- a/src/plugin/protocol-httpclient/jsp/noauth.jsp
+++ /dev/null
@@ -1,36 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests whether the client is sending any pre-emptive
- authentication headers. The client is expected not to send pre-emptive
- authentication headers. If such authentication headers are found, this
- JSP will return an HTTP 401 response; HTTP 200 response otherwise.
-
- Author: Susam Pal
---%><%
- if (request.getHeader("Authorization") != null) {
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>No authorization headers found</title></head>
-<body>
-<p>No authorization headers found.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/ntlm.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/jsp/ntlm.jsp b/src/plugin/protocol-httpclient/jsp/ntlm.jsp
deleted file mode 100644
index 6ad921e..0000000
--- a/src/plugin/protocol-httpclient/jsp/ntlm.jsp
+++ /dev/null
@@ -1,89 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests NTLM authentication. It generates an HTTP response
- with a WWW-Authenticate header for NTLM authentication and checks the
- user-name and domain supplied by the client. It does not check the other
- parameters and hashes, as controlled JUnit tests would be performed
- against this and only the proper submission of credentials needs to
- be tested.
-
- Author: Susam Pal
---%><%@ page
- import = "sun.misc.BASE64Decoder"
- import = "sun.misc.BASE64Encoder"
-%><%
- String authHeader = request.getHeader("Authorization");
- String username = null;
- String domain = null;
- String host = null;
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.startsWith("NTLM")) {
- byte[] msg = new BASE64Decoder().decodeBuffer(
- authHeader.substring(5));
- if (msg[8] == 1) {
- byte[] type2msg = {
- 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
- 2, 0, 0, 0, // Type 2 Indicator
- 10, 0, 10, 0, 32, 0, 0, 0, // length, offset
- 0x00, 0x02, (byte) 0x81, 0, // Flags
- 1, 2, 3, 4, 5, 6, 7, 8, // Challenge
- 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
- };
- response.setHeader("WWW-Authenticate", "NTLM "
- + new BASE64Encoder().encodeBuffer(type2msg));
- response.sendError(response.SC_UNAUTHORIZED);
- return;
- } else if (msg[8] == 3) {
- int length;
- int offset;
-
- // Get domain name
- length = msg[30] + msg[31] * 256;
- offset = msg[32] + msg[33] * 256;
- domain = new String(msg, offset, length);
-
- // Get user name
- length = msg[38] + msg[39] * 256;
- offset = msg[40] + msg[41] * 256;
- username = new String(msg, offset, length);
-
- // Get host name
- length = msg[46] + msg[47] * 256;
- offset = msg[48] + msg[49] * 256;
- host = new String(msg, offset, length);
-
- if ("ntlm_user".equalsIgnoreCase(username)
- && "NUTCH".equalsIgnoreCase(domain))
- authenticated = true;
- }
- }
-
- if (!authenticated) {
- response.setHeader("WWW-Authenticate", "NTLM");
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head>NTLM Authentication Test</head>
-<body>
-<p>Hi <%= username %>, You have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/plugin.xml b/src/plugin/protocol-httpclient/plugin.xml
deleted file mode 100644
index 1747713..0000000
--- a/src/plugin/protocol-httpclient/plugin.xml
+++ /dev/null
@@ -1,58 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="protocol-httpclient"
- name="Http / Https Protocol Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="protocol-httpclient.jar">
- <export name="*"/>
- </library>
- <library name="jsoup-1.8.1.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <import plugin="lib-http"/>
- </requires>
-
- <extension id="org.apache.nutch.protocol.httpclient"
- name="HttpProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
- <implementation id="org.apache.nutch.protocol.httpclient.Http"
- class="org.apache.nutch.protocol.httpclient.Http">
- <parameter name="protocolName" value="http"/>
- </implementation>
-
- </extension>
-
- <extension id="org.apache.nutch.protocol.https"
- name="HttpsProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
- <implementation id="org.apache.nutch.protocol.httpclient.Http"
- class="org.apache.nutch.protocol.httpclient.Http">
- <parameter name="protocolName" value="https"/>
- </implementation>
-
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
deleted file mode 100644
index afcf24a..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * Based on EasySSLProtocolSocketFactory from commons-httpclient:
- *
- * $Header:
- * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
- * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
- * -0800 (Sat, 26 Feb 2005) $
- */
-
-package org.apache.nutch.protocol.httpclient;
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.Socket;
-import java.net.UnknownHostException;
-
-import org.apache.commons.httpclient.ConnectTimeoutException;
-import org.apache.commons.httpclient.HttpClientError;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
-import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-
-public class DummySSLProtocolSocketFactory implements
- SecureProtocolSocketFactory {
-
- /** Logger object for this class. */
- private static final Logger LOG = LoggerFactory
- .getLogger(DummySSLProtocolSocketFactory.class);
-
- private SSLContext sslcontext = null;
-
- /**
- * Constructor for DummySSLProtocolSocketFactory.
- */
- public DummySSLProtocolSocketFactory() {
- super();
- }
-
- private static SSLContext createEasySSLContext() {
- try {
- SSLContext context = SSLContext.getInstance("SSL");
- context.init(null,
- new TrustManager[] { new DummyX509TrustManager(null) }, null);
- return context;
- } catch (Exception e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage(), e);
- }
- throw new HttpClientError(e.toString());
- }
- }
-
- private SSLContext getSSLContext() {
- if (this.sslcontext == null) {
- this.sslcontext = createEasySSLContext();
- }
- return this.sslcontext;
- }
-
- /**
- * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
- */
- public Socket createSocket(String host, int port, InetAddress clientHost,
- int clientPort) throws IOException, UnknownHostException {
-
- return getSSLContext().getSocketFactory().createSocket(host, port,
- clientHost, clientPort);
- }
-
- /**
- * Attempts to get a new socket connection to the given host within the given
- * time limit.
- * <p>
- * To circumvent the limitations of older JREs that do not support connect
- * timeout, a controller thread is executed. The controller thread attempts to
- * create a new socket within the given limit of time. If the socket constructor
- * does not return until the timeout expires, the controller terminates and
- * throws a {@link ConnectTimeoutException}.
- * </p>
- *
- * @param host
- * the host name/IP
- * @param port
- * the port on the host
- * @param localAddress
- * the local host name/IP to bind the socket to
- * @param localPort
- * the port on the local machine
- * @param params
- * {@link HttpConnectionParams Http connection parameters}
- *
- * @return Socket a new socket
- *
- * @throws IOException
- * if an I/O error occurs while creating the socket
- * @throws UnknownHostException
- * if the IP address of the host cannot be determined
- */
- public Socket createSocket(final String host, final int port,
- final InetAddress localAddress, final int localPort,
- final HttpConnectionParams params) throws IOException,
- UnknownHostException, ConnectTimeoutException {
- if (params == null) {
- throw new IllegalArgumentException("Parameters may not be null");
- }
- int timeout = params.getConnectionTimeout();
- if (timeout == 0) {
- return createSocket(host, port, localAddress, localPort);
- } else {
- // To be eventually deprecated when migrated to Java 1.4 or above
- return ControllerThreadSocketFactory.createSocket(this, host, port,
- localAddress, localPort, timeout);
- }
- }
-
- /**
- * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
- */
- public Socket createSocket(String host, int port) throws IOException,
- UnknownHostException {
- return getSSLContext().getSocketFactory().createSocket(host, port);
- }
-
- /**
- * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
- */
- public Socket createSocket(Socket socket, String host, int port,
- boolean autoClose) throws IOException, UnknownHostException {
- return getSSLContext().getSocketFactory().createSocket(socket, host, port,
- autoClose);
- }
-
- public boolean equals(Object obj) {
- return ((obj != null) && obj.getClass().equals(
- DummySSLProtocolSocketFactory.class));
- }
-
- public int hashCode() {
- return DummySSLProtocolSocketFactory.class.hashCode();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
deleted file mode 100644
index b5509cc..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * Based on EasyX509TrustManager from commons-httpclient.
- */
-
-package org.apache.nutch.protocol.httpclient;
-
-import java.security.KeyStore;
-import java.security.KeyStoreException;
-import java.security.NoSuchAlgorithmException;
-import java.security.cert.CertificateException;
-import java.security.cert.X509Certificate;
-
-import javax.net.ssl.TrustManagerFactory;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DummyX509TrustManager implements X509TrustManager {
- private X509TrustManager standardTrustManager = null;
-
- /** Logger object for this class. */
- private static final Logger LOG = LoggerFactory
- .getLogger(DummyX509TrustManager.class);
-
- /**
- * Constructor for DummyX509TrustManager.
- */
- public DummyX509TrustManager(KeyStore keystore)
- throws NoSuchAlgorithmException, KeyStoreException {
- super();
- String algo = TrustManagerFactory.getDefaultAlgorithm();
- TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
- factory.init(keystore);
- TrustManager[] trustmanagers = factory.getTrustManagers();
- if (trustmanagers.length == 0) {
- throw new NoSuchAlgorithmException(algo + " trust manager not supported");
- }
- this.standardTrustManager = (X509TrustManager) trustmanagers[0];
- }
-
- /**
- * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
- * String)
- */
- public boolean isClientTrusted(X509Certificate[] certificates) {
- return true;
- }
-
- /**
- * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
- * String)
- */
- public boolean isServerTrusted(X509Certificate[] certificates) {
- return true;
- }
-
- /**
- * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
- */
- public X509Certificate[] getAcceptedIssuers() {
- return this.standardTrustManager.getAcceptedIssuers();
- }
-
- public void checkClientTrusted(X509Certificate[] arg0, String arg1)
- throws CertificateException {
- // do nothing
-
- }
-
- public void checkServerTrusted(X509Certificate[] arg0, String arg1)
- throws CertificateException {
- // do nothing
-
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
deleted file mode 100644
index 75506ce..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-// JDK imports
-import java.io.InputStream;
-import java.io.IOException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import org.xml.sax.SAXException;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.Node;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// HTTP Client imports
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HostConfiguration;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
-import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
-import org.apache.commons.httpclient.protocol.Protocol;
-import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
-// NUTCH-1929 Consider implementing dependency injection for crawl HTTPS sites that use self signed certificates
-//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
-
-import org.apache.commons.lang.StringUtils;
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * <p>
- * This class is a protocol plugin that configures an HTTP client for Basic,
- * Digest and NTLM authentication schemes for web server as well as proxy
- * server. It takes care of HTTPS protocol as well as cookies in a single fetch
- * session.
- * </p>
- * <p>
- * Documentation can be found on the Nutch <a
- * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
- * >HttpAuthenticationSchemes</a> wiki page.
- * </p>
- * <p>
- * The original description of the motivation to support <a
- * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
- * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
- * HttpPostAuthentication development is documented at the <a
- * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
- * issue.
- *
- * @author Susam Pal
- */
-public class Http extends HttpBase {
-
- public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
- private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
-
- // Since the Configuration has not yet been set,
- // an unconfigured client is created here.
- private static HttpClient client = new HttpClient(connectionManager);
- private static String defaultUsername;
- private static String defaultPassword;
- private static String defaultRealm;
- private static String defaultScheme;
- private static String authFile;
- private static String agentHost;
- private static boolean authRulesRead = false;
- private static Configuration conf;
-
- private int maxThreadsTotal = 10;
-
- private String proxyUsername;
- private String proxyPassword;
- private String proxyRealm;
-
- private static HttpFormAuthConfigurer formConfigurer;
-
- /**
- * Returns the configured HTTP client.
- *
- * @return HTTP client
- */
- static synchronized HttpClient getClient() {
- return client;
- }
-
- /**
- * Constructs this plugin.
- */
- public Http() {
- super(LOG);
- }
-
- /**
- * Reads the configuration from the Nutch configuration files and sets the
- * configuration.
- *
- * @param conf
- * Configuration
- */
- public void setConf(Configuration conf) {
- super.setConf(conf);
- this.conf = conf;
- this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
- this.proxyUsername = conf.get("http.proxy.username", "");
- this.proxyPassword = conf.get("http.proxy.password", "");
- this.proxyRealm = conf.get("http.proxy.realm", "");
- agentHost = conf.get("http.agent.host", "");
- authFile = conf.get("http.auth.file", "");
- configureClient();
- try {
- setCredentials();
- } catch (Exception ex) {
- if (LOG.isErrorEnabled()) {
- LOG.error("Could not read " + authFile + " : " + ex.getMessage());
- }
- }
- }
-
- /**
- * Main method.
- *
- * @param args
- * Command line arguments
- */
- public static void main(String[] args) throws Exception {
- Http http = new Http();
- http.setConf(NutchConfiguration.create());
- main(http, args);
- }
-
- /**
- * Fetches the <code>url</code> with a configured HTTP client and gets the
- * response.
- *
- * @param url
- * URL to be fetched
- * @param datum
- * Crawl data
- * @param redirect
- * Follow redirects if and only if true
- * @return HTTP response
- */
- protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
- throws ProtocolException, IOException {
- resolveCredentials(url);
- return new HttpResponse(this, url, datum, redirect);
- }
-
- /**
- * Configures the HTTP client
- */
- private void configureClient() {
-
- // Set up an HTTPS socket factory that accepts self-signed certs.
- // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
- ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
- Protocol https = new Protocol("https", factory, 443);
- Protocol.registerProtocol("https", https);
-
- HttpConnectionManagerParams params = connectionManager.getParams();
- params.setConnectionTimeout(timeout);
- params.setSoTimeout(timeout);
- params.setSendBufferSize(BUFFER_SIZE);
- params.setReceiveBufferSize(BUFFER_SIZE);
-
- // --------------------------------------------------------------------------------
- // NUTCH-1836: Modification to increase the number of available connections
- // for multi-threaded crawls.
- // --------------------------------------------------------------------------------
- params.setMaxTotalConnections(conf.getInt(
- "mapred.tasktracker.map.tasks.maximum", 5)
- * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
-
- // Also set max connections per host to maxThreadsTotal since all threads
- // might be used to fetch from the same host - otherwise timeout errors can
- // occur
- params.setDefaultMaxConnectionsPerHost(conf.getInt(
- "fetcher.threads.fetch", maxThreadsTotal));
-
- // executeMethod(HttpMethod) seems to ignore the connection timeout on the
- // connection manager.
- // set it explicitly on the HttpClient.
- client.getParams().setConnectionManagerTimeout(timeout);
-
- HostConfiguration hostConf = client.getHostConfiguration();
- ArrayList<Header> headers = new ArrayList<Header>();
- // Set the User Agent in the header
- // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
- // prefer English
- headers.add(new Header("Accept-Language", acceptLanguage));
- // prefer UTF-8
- headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
- // prefer understandable formats
- headers
- .add(new Header(
- "Accept",
- "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
- // accept gzipped content
- headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
- hostConf.getParams().setParameter("http.default-headers", headers);
-
- // HTTP proxy server details
- if (useProxy) {
- hostConf.setProxy(proxyHost, proxyPort);
-
- if (proxyUsername.length() > 0) {
-
- AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
- this.proxyRealm);
-
- NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
- this.proxyPassword, Http.agentHost, this.proxyRealm);
-
- client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
- }
- }
-
- }
-
-  /**
-   * Reads the authentication configuration file (defined as 'http.auth.file'
-   * in the Nutch configuration) and applies what it declares: a
-   * <code>&lt;credentials&gt;</code> element with an <code>authMethod</code>
-   * attribute is turned into the form-login configuration, a
-   * <code>&lt;default&gt;</code> scope replaces the default credentials, and an
-   * <code>&lt;authscope&gt;</code> scope is registered on the HTTP client
-   * state.
-   * <p>
-   * The file is read at most once: <code>authRulesRead</code> is set before
-   * parsing, so a failed attempt is not retried. If the resource cannot be
-   * opened the method returns without doing anything else. The input stream
-   * is always closed before the method returns or throws.
-   *
-   * @throws ParserConfigurationException
-   *           If a document builder can not be created.
-   * @throws SAXException
-   *           If any parsing error occurs.
-   * @throws IOException
-   *           If any I/O error occurs.
-   * @throws IllegalArgumentException
-   *           If a form authentication entry is invalid (propagated from
-   *           <code>readFormAuthConfigurer</code>).
-   */
-  private static synchronized void setCredentials()
-      throws ParserConfigurationException, SAXException, IOException {
-
-    if (authRulesRead)
-      return;
-
-    authRulesRead = true; // Avoid re-attempting to read
-
-    InputStream is = conf.getConfResourceAsInputStream(authFile);
-    if (is == null)
-      return;
-
-    // Close the stream exactly once, regardless of what the file contains and
-    // even when parsing or rule handling throws.
-    try {
-      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
-          .parse(is);
-
-      Element rootElement = doc.getDocumentElement();
-      if (!"auth-configuration".equals(rootElement.getTagName())) {
-        // Only a warning: the children are still processed below.
-        if (LOG.isWarnEnabled())
-          LOG.warn("Bad auth conf file: root element <"
-              + rootElement.getTagName() + "> found in " + authFile
-              + " - must be <auth-configuration>");
-      }
-
-      // For each set of credentials
-      NodeList credList = rootElement.getChildNodes();
-      for (int i = 0; i < credList.getLength(); i++) {
-        Node credNode = credList.item(i);
-        if (!(credNode instanceof Element))
-          continue;
-
-        Element credElement = (Element) credNode;
-        if (!"credentials".equals(credElement.getTagName())) {
-          if (LOG.isWarnEnabled())
-            LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
-                + "> not recognized in " + authFile
-                + " - expected <credentials>");
-          continue;
-        }
-
-        String authMethod = credElement.getAttribute("authMethod");
-        // read http form post auth info
-        if (StringUtils.isNotBlank(authMethod)) {
-          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
-          continue;
-        }
-
-        String username = credElement.getAttribute("username");
-        String password = credElement.getAttribute("password");
-
-        // For each authentication scope
-        NodeList scopeList = credElement.getChildNodes();
-        for (int j = 0; j < scopeList.getLength(); j++) {
-          Node scopeNode = scopeList.item(j);
-          if (!(scopeNode instanceof Element))
-            continue;
-
-          Element scopeElement = (Element) scopeNode;
-
-          if ("default".equals(scopeElement.getTagName())) {
-
-            // Determine realm and scheme, if any
-            String realm = scopeElement.getAttribute("realm");
-            String scheme = scopeElement.getAttribute("scheme");
-
-            // Set default credentials
-            defaultUsername = username;
-            defaultPassword = password;
-            defaultRealm = realm;
-            defaultScheme = scheme;
-
-            if (LOG.isTraceEnabled()) {
-              LOG.trace("Credentials - username: " + username
-                  + "; set as default" + " for realm: " + realm + "; scheme: "
-                  + scheme);
-            }
-
-          } else if ("authscope".equals(scopeElement.getTagName())) {
-
-            // Determine authentication scope details
-            String host = scopeElement.getAttribute("host");
-            int port = -1; // For setting port to AuthScope.ANY_PORT
-            try {
-              port = Integer.parseInt(scopeElement.getAttribute("port"));
-            } catch (Exception ex) {
-              // do nothing, port is already set to any port
-            }
-            String realm = scopeElement.getAttribute("realm");
-            String scheme = scopeElement.getAttribute("scheme");
-
-            // Set credentials for the determined scope
-            AuthScope authScope = getAuthScope(host, port, realm, scheme);
-            NTCredentials credentials = new NTCredentials(username, password,
-                agentHost, realm);
-
-            client.getState().setCredentials(authScope, credentials);
-
-            if (LOG.isTraceEnabled()) {
-              LOG.trace("Credentials - username: " + username
-                  + "; set for AuthScope - " + "host: " + host + "; port: "
-                  + port + "; realm: " + realm + "; scheme: " + scheme);
-            }
-
-          } else {
-            if (LOG.isWarnEnabled())
-              LOG.warn("Bad auth conf file: Element <"
-                  + scopeElement.getTagName() + "> not recognized in "
-                  + authFile + " - expected <authscope>");
-          }
-        }
-      }
-    } finally {
-      is.close();
-    }
-  }
-
-  /**
-   * Builds the form-login configuration from a
-   * <code>&lt;credentials&gt;</code> element that carries an
-   * <code>authMethod</code> attribute. Expected layout:
-   *
-   * <pre>
-   * &lt;auth-configuration&gt;
-   *   &lt;credentials authMethod="formAuth" loginUrl="loginUrl"
-   *       loginFormId="loginFormId" loginRedirect="true"&gt;
-   *     &lt;loginPostData&gt;
-   *       &lt;field name="username" value="user1"/&gt;
-   *     &lt;/loginPostData&gt;
-   *     &lt;additionalPostHeaders&gt;
-   *       &lt;field name="header1" value="value1"/&gt;
-   *     &lt;/additionalPostHeaders&gt;
-   *     &lt;removedFormFields&gt;
-   *       &lt;field name="header1"/&gt;
-   *     &lt;/removedFormFields&gt;
-   *   &lt;/credentials&gt;
-   * &lt;/auth-configuration&gt;
-   * </pre>
-   *
-   * Child elements other than the three containers shown above are ignored.
-   *
-   * @param credElement
-   *          the <code>&lt;credentials&gt;</code> element to read
-   * @param authMethod
-   *          value of its <code>authMethod</code> attribute; only
-   *          <code>formAuth</code> is supported
-   * @return the populated form authentication configuration
-   * @throws IllegalArgumentException
-   *           if <code>authMethod</code> is not <code>formAuth</code>, or if
-   *           <code>loginUrl</code> or <code>loginFormId</code> is blank
-   */
-  private static HttpFormAuthConfigurer readFormAuthConfigurer(
-      Element credElement, String authMethod) {
-    if (!"formAuth".equals(authMethod)) {
-      throw new IllegalArgumentException("Unsupported authMethod: "
-          + authMethod);
-    }
-
-    HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
-
-    // loginUrl and loginFormId are mandatory, loginRedirect is optional.
-    String str = credElement.getAttribute("loginUrl");
-    if (!StringUtils.isNotBlank(str)) {
-      throw new IllegalArgumentException("Must set loginUrl.");
-    }
-    formConfigurer.setLoginUrl(str.trim());
-
-    str = credElement.getAttribute("loginFormId");
-    if (!StringUtils.isNotBlank(str)) {
-      throw new IllegalArgumentException("Must set loginFormId.");
-    }
-    formConfigurer.setLoginFormId(str.trim());
-
-    str = credElement.getAttribute("loginRedirect");
-    if (StringUtils.isNotBlank(str)) {
-      formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
-    }
-
-    NodeList nodeList = credElement.getChildNodes();
-    for (int j = 0; j < nodeList.getLength(); j++) {
-      Node node = nodeList.item(j);
-      if (!(node instanceof Element))
-        continue;
-
-      Element element = (Element) node;
-      String tagName = element.getTagName();
-      if ("loginPostData".equals(tagName)) {
-        formConfigurer.setLoginPostData(readFormFieldAttributes(element));
-      } else if ("additionalPostHeaders".equals(tagName)) {
-        formConfigurer
-            .setAdditionalPostHeaders(readFormFieldAttributes(element));
-      } else if ("removedFormFields".equals(tagName)) {
-        // Only the field names matter here; the values are discarded.
-        Set<String> removedFormFields = new HashSet<String>(
-            readFormFieldAttributes(element).keySet());
-        formConfigurer.setRemovedFormFields(removedFormFields);
-      }
-    }
-
-    return formConfigurer;
-  }
-
-  /**
-   * Collects the <code>name</code> and <code>value</code> attributes of every
-   * child element of <code>parent</code> into a map keyed by name. The child
-   * tag name is not checked, and a later entry replaces an earlier one with
-   * the same name.
-   *
-   * @param parent
-   *          container element whose child elements describe fields
-   * @return a new mutable map from field name to field value
-   */
-  private static Map<String, String> readFormFieldAttributes(Element parent) {
-    Map<String, String> fields = new HashMap<String, String>();
-    NodeList childNodes = parent.getChildNodes();
-    for (int k = 0; k < childNodes.getLength(); k++) {
-      Node fieldNode = childNodes.item(k);
-      if (!(fieldNode instanceof Element))
-        continue;
-
-      Element fieldElement = (Element) fieldNode;
-      fields.put(fieldElement.getAttribute("name"),
-          fieldElement.getAttribute("value"));
-    }
-    return fields;
-  }
-
-  /**
-   * Makes sure the HTTP client has credentials to use for the specified
-   * <code>url</code>.
-   * <p>
-   * When a form-login configuration is present, a form login is performed and
-   * nothing else happens. Otherwise, if a non-empty default username is
-   * configured and the client state holds no credentials for the host and
-   * port of the <code>url</code>, the default credentials are registered for
-   * that host and port. If credentials are already present the client is left
-   * untouched.
-   *
-   * @param url
-   *          URL to be fetched
-   */
-  private void resolveCredentials(URL url) {
-
-    // A form-login configuration takes precedence over default credentials.
-    if (formConfigurer != null) {
-      HttpFormAuthentication formLogin = new HttpFormAuthentication(
-          formConfigurer, client, this);
-      try {
-        formLogin.login();
-      } catch (Exception e) {
-        throw new RuntimeException(e);
-      }
-      return;
-    }
-
-    // Without a default username there is nothing to register.
-    if (defaultUsername == null || defaultUsername.length() == 0)
-      return;
-
-    String host = url.getHost();
-
-    // Fall back to the protocol's standard port when the URL names none.
-    int port = url.getPort();
-    if (port == -1)
-      port = "https".equals(url.getProtocol()) ? 443 : 80;
-
-    boolean alreadyConfigured = client.getState().getCredentials(
-        new AuthScope(host, port)) != null;
-
-    if (LOG.isTraceEnabled())
-      LOG.trace("Pre-configured credentials with scope - host: " + host
-          + "; port: " + port
-          + (alreadyConfigured ? "; found for url: " : "; not found for url: ")
-          + url);
-
-    // Credentials are already configured, so do nothing and return
-    if (alreadyConfigured)
-      return;
-
-    AuthScope defaultScope = getAuthScope(host, port, defaultRealm,
-        defaultScheme);
-    NTCredentials defaultCredentials = new NTCredentials(defaultUsername,
-        defaultPassword, agentHost, defaultRealm);
-
-    client.getState().setCredentials(defaultScope, defaultCredentials);
-  }
-
-  /**
-   * Creates an authentication scope from the given <code>host</code>,
-   * <code>port</code>, <code>realm</code> and <code>scheme</code>. An empty
-   * string is passed on as <code>null</code> and any negative port as
-   * <code>-1</code> (presumably the "any" wildcards of
-   * <code>AuthScope</code>).
-   *
-   * @param host
-   *          Host name or address; must not be <code>null</code>.
-   * @param port
-   *          Port number.
-   * @param realm
-   *          Authentication realm; must not be <code>null</code>.
-   * @param scheme
-   *          Authentication scheme; must not be <code>null</code>.
-   * @return the authentication scope built from the normalized values
-   */
-  private static AuthScope getAuthScope(String host, int port, String realm,
-      String scheme) {
-
-    String scopeHost = host.isEmpty() ? null : host;
-    int scopePort = port < 0 ? -1 : port;
-    String scopeRealm = realm.isEmpty() ? null : realm;
-    String scopeScheme = scheme.isEmpty() ? null : scheme;
-
-    return new AuthScope(scopeHost, scopePort, scopeRealm, scopeScheme);
-  }
-
-  /**
-   * Creates an authentication scope from the given <code>host</code>,
-   * <code>port</code> and <code>realm</code>, leaving the scheme unspecified.
-   *
-   * @param host
-   *          Host name or address.
-   * @param port
-   *          Port number.
-   * @param realm
-   *          Authentication realm.
-   * @return the scope produced by the four-argument overload when it is given
-   *         an empty scheme
-   */
-  private static AuthScope getAuthScope(String host, int port, String realm) {
-
-    // An empty scheme is how the four-argument overload is told "no scheme".
-    final String unspecifiedScheme = "";
-    return getAuthScope(host, port, realm, unspecifiedScheme);
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
deleted file mode 100644
index 54dc905..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-import java.util.List;
-
-/**
- * Minimal contract every Http Authentication implementation has to fulfil:
- * access to the generated credentials and to the realm they belong to.
- *
- * @see HttpAuthenticationFactory
- *
- * @author Matt Tencati
- */
-public interface HttpAuthentication {
-
-  /**
-   * Returns the credentials generated by this HttpAuthentication object.
-   *
-   * @return The credentials value; may be <code>null</code>
-   */
-  List<String> getCredentials();
-
-  /**
-   * Returns the realm this HttpAuthentication object was created with.
-   *
-   * @return The realm value
-   */
-  String getRealm();
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
deleted file mode 100644
index daff5ec..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-/**
- * Checked exception that signals a problem while creating Authentication
- * objects. It may later also serve to collect authentication failures seen
- * during Http protocol transfer, so that the user can be presented with the
- * credentials required for a future fetch.
- *
- * @author Matt Tencati
- */
-public class HttpAuthenticationException extends Exception {
-
-  /**
-   * Creates an exception that has neither a detail message nor a cause.
-   */
-  public HttpAuthenticationException() {
-  }
-
-  /**
-   * Creates an exception that wraps the given cause. The detail message is
-   * taken from the cause when the cause is not null.
-   *
-   * @param cause
-   *          the underlying cause, available through {@link #getCause()}
-   */
-  public HttpAuthenticationException(Throwable cause) {
-    super(cause);
-  }
-
-  /**
-   * Creates an exception with the given detail message.
-   *
-   * @param message
-   *          the detail message, available through
-   *          {@link Throwable#getMessage()}
-   */
-  public HttpAuthenticationException(String message) {
-    super(message);
-  }
-
-  /**
-   * Creates an exception with the given detail message and cause.
-   *
-   * @param message
-   *          the detail message, available through
-   *          {@link Throwable#getMessage()}
-   * @param cause
-   *          the underlying cause, available through {@link #getCause()}
-   */
-  public HttpAuthenticationException(String message, Throwable cause) {
-    super(message, cause);
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
deleted file mode 100644
index 064a6d0..0000000
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.httpclient;
-
-// JDK imports
-import java.util.ArrayList;
-import java.util.Collection;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-
-/**
- * Provides the Http protocol implementation with the ability to authenticate
- * when prompted. The goal is to provide multiple authentication types but for
- * now just the {@link HttpBasicAuthentication} authentication type is provided.
- *
- * @see HttpBasicAuthentication
- * @see Http
- * @see HttpResponse
- *
- * @author Matt Tencati
- */
-public class HttpAuthenticationFactory implements Configurable {
-
-  /**
-   * The HTTP Authentication (WWW-Authenticate) header which is returned by a
-   * webserver requiring authentication.
-   */
-  public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger(HttpAuthenticationFactory.class);
-
-  /** Configuration handed to the authentication implementations. */
-  private Configuration conf = null;
-
-  /**
-   * Creates a factory that uses the given configuration.
-   *
-   * @param conf
-   *          the configuration to use
-   */
-  public HttpAuthenticationFactory(Configuration conf) {
-    setConf(conf);
-  }
-
-  /**
-   * Replaces the configuration used by this factory.
-   *
-   * @param conf
-   *          the configuration to use
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /**
-   * Returns the configuration used by this factory.
-   *
-   * @return the current configuration, possibly <code>null</code>
-   */
-  public Configuration getConf() {
-    return conf;
-  }
-
-  /**
-   * Looks for an authentication that answers the challenge found in the
-   * {@link #WWW_AUTHENTICATE} entry of the given response headers.
-   *
-   * @param header
-   *          the response headers; may be <code>null</code>
-   * @return the matching authentication, or <code>null</code> when
-   *         <code>header</code> is <code>null</code>, carries no
-   *         WWW-Authenticate value, no authentication matches the challenge,
-   *         or the lookup fails (the failure is logged)
-   */
-  public HttpAuthentication findAuthentication(Metadata header) {
-
-    if (header == null)
-      return null;
-
-    try {
-      String challengeString = header.get(WWW_AUTHENTICATE);
-
-      // No challenge in the response: nothing to authenticate against. Without
-      // this guard the comparison below threw a NullPointerException that was
-      // then logged as an error.
-      if (challengeString == null)
-        return null;
-
-      // A bare "NTLM" challenge is replaced by a fixed Basic challenge
-      // (NOTE(review): the reason for the "techweb" realm is not visible
-      // here).
-      if (challengeString.equals("NTLM"))
-        challengeString = "Basic realm=techweb";
-
-      if (LOG.isTraceEnabled())
-        LOG.trace("Checking challengeString=" + challengeString);
-
-      HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(
-          challengeString, conf);
-      if (auth != null)
-        return auth;
-
-      // TODO Add additional Authentication lookups here
-    } catch (Exception e) {
-      // Best effort: a failed lookup is reported as "no authentication".
-      LOG.error("Error: ", e);
-    }
-    return null;
-  }
-}