You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [21/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Fri Jan 9 06:34:33 2015
@@ -51,18 +51,17 @@ public class HttpResponse implements Res
private final Metadata headers = new SpellCheckedMetadata();
protected enum Scheme {
- HTTP,
- HTTPS,
+ HTTP, HTTPS,
}
public HttpResponse(HttpBase http, URL url, WebPage page)
- throws ProtocolException, IOException {
+ throws ProtocolException, IOException {
this.http = http;
this.url = url;
Scheme scheme = null;
-
+
if ("http".equals(url.getProtocol())) {
scheme = Scheme.HTTP;
} else if ("https".equals(url.getProtocol())) {
@@ -90,50 +89,56 @@ public class HttpResponse implements Res
} else {
port = 443;
}
- portString= "";
+ portString = "";
} else {
- port= url.getPort();
- portString= ":" + port;
+ port = url.getPort();
+ portString = ":" + port;
}
Socket socket = null;
try {
- socket = new Socket(); // create the socket
+ socket = new Socket(); // create the socket
socket.setSoTimeout(http.getTimeout());
-
// connect
String sockHost = http.useProxy() ? http.getProxyHost() : host;
int sockPort = http.useProxy() ? http.getProxyPort() : port;
- InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
+ InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
-
+
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault();
- SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true);
+ SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+ .getDefault();
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket,
+ sockHost, sockPort, true);
sslsocket.setUseClientMode(true);
-
- // Get the protocols and ciphers supported by this JVM
- Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols()));
- Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites()));
-
+
+ // Get the protocols and ciphers supported by this JVM
+ Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket
+ .getSupportedProtocols()));
+ Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket
+ .getSupportedCipherSuites()));
+
// Intersect with preferred protocols and ciphers
protocols.retainAll(http.getTlsPreferredProtocols());
ciphers.retainAll(http.getTlsPreferredCipherSuites());
-
- sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()]));
- sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()]));
-
+
+ sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols
+ .size()]));
+ sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers
+ .size()]));
+
sslsocket.startHandshake();
socket = sslsocket;
}
-
+
conf = http.getConf();
if (sockAddr != null
&& conf.getBoolean("store.ip.address", false) == true) {
- String ipString = sockAddr.getAddress().getHostAddress(); //get the ip address
+ String ipString = sockAddr.getAddress().getHostAddress(); // get the ip
+ // address
page.getMetadata().put(new Utf8("_ip_"),
- ByteBuffer.wrap(ipString.getBytes()));
+ ByteBuffer.wrap(ipString.getBytes()));
}
// make request
@@ -141,9 +146,9 @@ public class HttpResponse implements Res
StringBuffer reqStr = new StringBuffer("GET ");
if (http.useProxy()) {
- reqStr.append(url.getProtocol()+"://"+host+portString+path);
+ reqStr.append(url.getProtocol() + "://" + host + portString + path);
} else {
- reqStr.append(path);
+ reqStr.append(path);
}
reqStr.append(" HTTP/1.0\r\n");
@@ -161,39 +166,40 @@ public class HttpResponse implements Res
String userAgent = http.getUserAgent();
if ((userAgent == null) || (userAgent.length() == 0)) {
- if (Http.LOG.isErrorEnabled()) { Http.LOG.error("User-agent is not set!"); }
+ if (Http.LOG.isErrorEnabled()) {
+ Http.LOG.error("User-agent is not set!");
+ }
} else {
reqStr.append("User-Agent: ");
reqStr.append(userAgent);
reqStr.append("\r\n");
}
-// if (page.isReadable(WebPage.Field.MODIFIED_TIME.getIndex())) {
- reqStr.append("If-Modified-Since: " +
- HttpDateFormat.toString(page.getModifiedTime()));
- reqStr.append("\r\n");
-// }
+ // if (page.isReadable(WebPage.Field.MODIFIED_TIME.getIndex())) {
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(page.getModifiedTime()));
+ reqStr.append("\r\n");
+ // }
reqStr.append("\r\n");
- byte[] reqBytes= reqStr.toString().getBytes();
+ byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
- PushbackInputStream in = // process response
- new PushbackInputStream(
- new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE) ;
+ PushbackInputStream in = // process response
+ new PushbackInputStream(new BufferedInputStream(socket.getInputStream(),
+ Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
- boolean haveSeenNonContinueStatus= false;
+ boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
// parse status code line
this.code = parseStatusLine(in, line);
// parse headers
parseHeaders(in, line);
- haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
+ haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
}
String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
@@ -228,10 +234,10 @@ public class HttpResponse implements Res
}
-
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
public URL getUrl() {
return url;
@@ -253,15 +259,15 @@ public class HttpResponse implements Res
return content;
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
-
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
- private void readPlainContent(InputStream in)
- throws HttpException, IOException {
+ private void readPlainContent(InputStream in) throws HttpException,
+ IOException {
- int contentLength = Integer.MAX_VALUE; // get content length
+ int contentLength = Integer.MAX_VALUE; // get content length
String contentLengthString = headers.get(Response.CONTENT_LENGTH);
if (contentLengthString != null) {
contentLengthString = contentLengthString.trim();
@@ -269,12 +275,13 @@ public class HttpResponse implements Res
if (!contentLengthString.isEmpty())
contentLength = Integer.parseInt(contentLengthString);
} catch (NumberFormatException e) {
- throw new HttpException("bad content length: "+contentLengthString);
+ throw new HttpException("bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0
- && contentLength > http.getMaxContent()) // limit download size
- contentLength = http.getMaxContent();
+ if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit
+ // download
+ // size
+ contentLength = http.getMaxContent();
ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
byte[] bytes = new byte[Http.BUFFER_SIZE];
@@ -384,38 +391,37 @@ public class HttpResponse implements Res
parseHeaders(in, line);
}
-
+
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
- throws IOException, HttpException {
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
- int codeEnd = line.indexOf(" ", codeStart+1);
+ int codeEnd = line.indexOf(" ", codeStart + 1);
// handle lines with no plaintext result code, ie:
// "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
if (codeEnd == -1)
- codeEnd= line.length();
+ codeEnd = line.length();
int code;
try {
- code= Integer.parseInt(line.substring(codeStart+1, codeEnd));
+ code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line
- + "': " + e.getMessage(), e);
+ throw new HttpException("bad status line '" + line + "': "
+ + e.getMessage(), e);
}
return code;
}
+ private void processHeaderLine(StringBuffer line) throws IOException,
+ HttpException {
- private void processHeaderLine(StringBuffer line)
- throws IOException, HttpException {
-
- int colonIndex = line.indexOf(":"); // key is up to colon
+ int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
int i;
- for (i= 0; i < line.length(); i++)
+ for (i = 0; i < line.length(); i++)
if (!Character.isWhitespace(line.charAt(i)))
break;
if (i == line.length())
@@ -424,7 +430,7 @@ public class HttpResponse implements Res
}
String key = line.substring(0, colonIndex);
- int valueStart = colonIndex+1; // skip whitespace
+ int valueStart = colonIndex + 1; // skip whitespace
while (valueStart < line.length()) {
int c = line.charAt(valueStart);
if (c != ' ' && c != '\t')
@@ -435,28 +441,27 @@ public class HttpResponse implements Res
headers.set(key, value);
}
-
// Adds headers to our headers Metadata
private void parseHeaders(PushbackInputStream in, StringBuffer line)
- throws IOException, HttpException {
+ throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
// handle HTTP responses with missing blank line after headers
int pos;
- if ( ((pos= line.indexOf("<!DOCTYPE")) != -1)
- || ((pos= line.indexOf("<HTML")) != -1)
- || ((pos= line.indexOf("<html")) != -1) ) {
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
+ || ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -470,29 +475,29 @@ public class HttpResponse implements Res
}
private static int readLine(PushbackInputStream in, StringBuffer line,
- boolean allowContinuedLine)
- throws IOException {
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
- case '\r':
- if (peek(in) == '\n') {
- in.read();
- }
- case '\n':
- if (line.length() > 0) {
- // at EOL -- check for continued line if the current
- // (possibly continued) line wasn't blank
- if (allowContinuedLine)
- switch (peek(in)) {
- case ' ' : case '\t': // line is continued
- in.read();
- continue;
- }
- }
- return line.length(); // else complete
- default :
- line.append((char)c);
+ case '\r':
+ if (peek(in) == '\n') {
+ in.read();
+ }
+ case '\n':
+ if (line.length() > 0) {
+ // at EOL -- check for continued line if the current
+ // (possibly continued) line wasn't blank
+ if (allowContinuedLine)
+ switch (peek(in)) {
+ case ' ':
+ case '\t': // line is continued
+ in.read();
+ continue;
+ }
+ }
+ return line.length(); // else complete
+ default:
+ line.append((char) c);
}
}
throw new EOFException();
Modified: nutch/branches/2.x/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java Fri Jan 9 06:34:33 2015
@@ -34,7 +34,7 @@ import org.mortbay.jetty.servlet.Context
import org.mortbay.jetty.servlet.ServletHolder;
/**
- * Test cases for protocol-http
+ * Test cases for protocol-http
*/
public class TestProtocolHttp {
private static final String RES_DIR = System.getProperty("test.data", ".");
@@ -44,7 +44,7 @@ public class TestProtocolHttp {
private Context root;
private Configuration conf;
private int port;
-
+
public void setUp(boolean redirection) throws Exception {
this.conf = new Configuration();
this.conf.addResource("nutch-default.xml");
@@ -52,18 +52,18 @@ public class TestProtocolHttp {
this.http = new Http();
this.http.setConf(conf);
-
+
this.server = new Server();
-
+
if (redirection) {
this.root = new Context(server, "/redirection", Context.SESSIONS);
this.root.setAttribute("newContextURL", "/redirect");
- }
- else {
+ } else {
this.root = new Context(server, "/", Context.SESSIONS);
}
- ServletHolder sh = new ServletHolder(org.apache.jasper.servlet.JspServlet.class);
+ ServletHolder sh = new ServletHolder(
+ org.apache.jasper.servlet.JspServlet.class);
this.root.addServlet(sh, "*.jsp");
this.root.setResourceBase(RES_DIR);
}
@@ -89,12 +89,14 @@ public class TestProtocolHttp {
startServer(47500, true);
fetchPage("/redirection", 302);
}
-
+
/**
* Starts the Jetty server at a specified port and redirection parameter.
*
- * @param portno Port number.
- * @param redirection whether redirection
+ * @param portno
+ * Port number.
+ * @param redirection
+ * whether redirection
*/
private void startServer(int portno, boolean redirection) throws Exception {
port = portno;
@@ -123,11 +125,13 @@ public class TestProtocolHttp {
Response response = http.getResponse(url, p, true);
ProtocolOutput out = http.getProtocolOutput(url.toString(), p);
Content content = out.getContent();
-
- assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
- if (page.compareTo("/nonexists.html") != 0
- && page.compareTo("/brokenpage.jsp") != 0
- && page.compareTo("/redirection") != 0)
- assertEquals("ContentType " + url, "application/xhtml+xml", content.getContentType());
+
+ assertEquals("HTTP Status Code for " + url, expectedCode,
+ response.getCode());
+ if (page.compareTo("/nonexists.html") != 0
+ && page.compareTo("/brokenpage.jsp") != 0
+ && page.compareTo("/redirection") != 0)
+ assertEquals("ContentType " + url, "application/xhtml+xml",
+ content.getContentType());
}
}
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Fri Jan 9 06:34:33 2015
@@ -1,19 +1,19 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/*
* Based on EasySSLProtocolSocketFactory from commons-httpclient:
*
@@ -41,10 +41,12 @@ import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
-public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory {
+public class DummySSLProtocolSocketFactory implements
+ SecureProtocolSocketFactory {
/** Logger object for this class. */
- private static final Logger LOG = LoggerFactory.getLogger(DummySSLProtocolSocketFactory.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DummySSLProtocolSocketFactory.class);
private SSLContext sslcontext = null;
@@ -58,10 +60,13 @@ public class DummySSLProtocolSocketFacto
private static SSLContext createEasySSLContext() {
try {
SSLContext context = SSLContext.getInstance("SSL");
- context.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ context.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
return context;
} catch (Exception e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage(), e); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage(), e);
+ }
throw new HttpClientError(e.toString());
}
}
@@ -76,10 +81,11 @@ public class DummySSLProtocolSocketFacto
/**
* @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
*/
- public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException,
- UnknownHostException {
+ public Socket createSocket(String host, int port, InetAddress clientHost,
+ int clientPort) throws IOException, UnknownHostException {
- return getSSLContext().getSocketFactory().createSocket(host, port, clientHost, clientPort);
+ return getSSLContext().getSocketFactory().createSocket(host, port,
+ clientHost, clientPort);
}
/**
@@ -93,20 +99,28 @@ public class DummySSLProtocolSocketFacto
* throws an {@link ConnectTimeoutException}
* </p>
*
- * @param host the host name/IP
- * @param port the port on the host
- * @param localAddress the local host name/IP to bind the socket to
- * @param localPort the port on the local machine
- * @param params {@link HttpConnectionParams Http connection parameters}
+ * @param host
+ * the host name/IP
+ * @param port
+ * the port on the host
+ * @param localAddress
+ * the local host name/IP to bind the socket to
+ * @param localPort
+ * the port on the local machine
+ * @param params
+ * {@link HttpConnectionParams Http connection parameters}
*
* @return Socket a new socket
*
- * @throws IOException if an I/O error occurs while creating the socket
- * @throws UnknownHostException if the IP address of the host cannot be
- * determined
+ * @throws IOException
+ * if an I/O error occurs while creating the socket
+ * @throws UnknownHostException
+ * if the IP address of the host cannot be determined
*/
- public Socket createSocket(final String host, final int port, final InetAddress localAddress, final int localPort,
- final HttpConnectionParams params) throws IOException, UnknownHostException, ConnectTimeoutException {
+ public Socket createSocket(final String host, final int port,
+ final InetAddress localAddress, final int localPort,
+ final HttpConnectionParams params) throws IOException,
+ UnknownHostException, ConnectTimeoutException {
if (params == null) {
throw new IllegalArgumentException("Parameters may not be null");
}
@@ -115,27 +129,31 @@ public class DummySSLProtocolSocketFacto
return createSocket(host, port, localAddress, localPort);
} else {
// To be eventually deprecated when migrated to Java 1.4 or above
- return ControllerThreadSocketFactory.createSocket(this, host, port, localAddress, localPort, timeout);
+ return ControllerThreadSocketFactory.createSocket(this, host, port,
+ localAddress, localPort, timeout);
}
}
/**
* @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
*/
- public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
+ public Socket createSocket(String host, int port) throws IOException,
+ UnknownHostException {
return getSSLContext().getSocketFactory().createSocket(host, port);
}
/**
* @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
*/
- public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException,
- UnknownHostException {
- return getSSLContext().getSocketFactory().createSocket(socket, host, port, autoClose);
+ public Socket createSocket(Socket socket, String host, int port,
+ boolean autoClose) throws IOException, UnknownHostException {
+ return getSSLContext().getSocketFactory().createSocket(socket, host, port,
+ autoClose);
}
public boolean equals(Object obj) {
- return ((obj != null) && obj.getClass().equals(DummySSLProtocolSocketFactory.class));
+ return ((obj != null) && obj.getClass().equals(
+ DummySSLProtocolSocketFactory.class));
}
public int hashCode() {
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java Fri Jan 9 06:34:33 2015
@@ -1,19 +1,19 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/*
* Based on EasyX509TrustManager from commons-httpclient.
*/
@@ -30,53 +30,57 @@ import javax.net.ssl.TrustManagerFactory
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
-public class DummyX509TrustManager implements X509TrustManager
-{
- private X509TrustManager standardTrustManager = null;
-
- /**
- * Constructor for DummyX509TrustManager.
- */
- public DummyX509TrustManager(KeyStore keystore) throws NoSuchAlgorithmException, KeyStoreException {
- super();
- String algo = TrustManagerFactory.getDefaultAlgorithm();
- TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
- factory.init(keystore);
- TrustManager[] trustmanagers = factory.getTrustManagers();
- if (trustmanagers.length == 0) {
- throw new NoSuchAlgorithmException(algo + " trust manager not supported");
- }
- this.standardTrustManager = (X509TrustManager)trustmanagers[0];
- }
-
- /**
- * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], String)
- */
- public boolean isClientTrusted(X509Certificate[] certificates) {
- return true;
- }
-
- /**
- * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], String)
- */
- public boolean isServerTrusted(X509Certificate[] certificates) {
- return true;
- }
+public class DummyX509TrustManager implements X509TrustManager {
+ private X509TrustManager standardTrustManager = null;
- /**
- * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
- */
- public X509Certificate[] getAcceptedIssuers() {
- return this.standardTrustManager.getAcceptedIssuers();
- }
+ /**
+ * Constructor for DummyX509TrustManager.
+ */
+ public DummyX509TrustManager(KeyStore keystore)
+ throws NoSuchAlgorithmException, KeyStoreException {
+ super();
+ String algo = TrustManagerFactory.getDefaultAlgorithm();
+ TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+ factory.init(keystore);
+ TrustManager[] trustmanagers = factory.getTrustManagers();
+ if (trustmanagers.length == 0) {
+ throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+ }
+ this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+ }
+
+ /**
+ * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
+ * String)
+ */
+ public boolean isClientTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
+ * String)
+ */
+ public boolean isServerTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ */
+ public X509Certificate[] getAcceptedIssuers() {
+ return this.standardTrustManager.getAcceptedIssuers();
+ }
+
+ public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing
+
+ }
+
+ public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing
- public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
- // do nothing
-
- }
-
- public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
- // do nothing
-
- }
+ }
}
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Jan 9 06:34:33 2015
@@ -67,395 +67,383 @@ import org.apache.nutch.util.NutchConfig
*/
public class Http extends HttpBase {
- public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+ public static final Logger LOG = LoggerFactory.getLogger(Http.class);
- private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
+ private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
- // Since the Configuration has not yet been set,
- // then an unconfigured client is returned.
- private static HttpClient client = new HttpClient(connectionManager);
- private static String defaultUsername;
- private static String defaultPassword;
- private static String defaultRealm;
- private static String defaultScheme;
- private static String authFile;
- private static String agentHost;
- private static boolean authRulesRead = false;
- private static Configuration conf;
-
- int maxThreadsTotal = 10;
-
- private String proxyUsername;
- private String proxyPassword;
- private String proxyRealm;
-
- private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
-
- static {
- FIELDS.add(WebPage.Field.MODIFIED_TIME);
- FIELDS.add(WebPage.Field.HEADERS);
- }
-
- @Override
- public Collection<Field> getFields() {
- return FIELDS;
- }
-
- /**
- * Returns the configured HTTP client.
- *
- * @return HTTP client
- */
- static synchronized HttpClient getClient() {
- return client;
- }
-
- /**
- * Constructs this plugin.
- */
- public Http() {
- super(LOG);
- }
-
- /**
- * Reads the configuration from the Nutch configuration files and sets the
- * configuration.
- *
- * @param conf
- * Configuration
- */
- public void setConf(Configuration conf) {
- super.setConf(conf);
- Http.conf = conf;
- this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
- this.proxyUsername = conf.get("http.proxy.username", "");
- this.proxyPassword = conf.get("http.proxy.password", "");
- this.proxyRealm = conf.get("http.proxy.realm", "");
- agentHost = conf.get("http.agent.host", "");
- authFile = conf.get("http.auth.file", "");
- configureClient();
- try {
- setCredentials();
- } catch (Exception ex) {
- if (LOG.isErrorEnabled()) {
- LOG.error("Could not read " + authFile + " : "
- + ex.getMessage());
- }
- }
- }
-
- /**
- * Main method.
- *
- * @param args
- * Command line arguments
- */
- public static void main(String[] args) throws Exception {
- Http http = new Http();
- http.setConf(NutchConfiguration.create());
- main(http, args);
- }
-
- /**
- * Fetches the <code>url</code> with a configured HTTP client and gets the
- * response.
- *
- * @param url
- * URL to be fetched
- * @param datum
- * Crawl data
- * @param redirect
- * Follow redirects if and only if true
- * @return HTTP response
- */
- protected Response getResponse(URL url, WebPage page, boolean redirect)
- throws ProtocolException, IOException {
- resolveCredentials(url);
- return new HttpResponse(this, url, page, redirect);
- }
-
- /**
- * Configures the HTTP client
- */
- private void configureClient() {
-
- // Set up an HTTPS socket factory that accepts self-signed certs.
- ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
- Protocol https = new Protocol("https", factory, 443);
- Protocol.registerProtocol("https", https);
-
- HttpConnectionManagerParams params = connectionManager.getParams();
- params.setConnectionTimeout(timeout);
- params.setSoTimeout(timeout);
- params.setSendBufferSize(BUFFER_SIZE);
- params.setReceiveBufferSize(BUFFER_SIZE);
- params.setMaxTotalConnections(maxThreadsTotal);
-
- //Also set max connections per host to maxThreadsTotal since all threads
- //might be used to fetch from the same host - otherwise timeout errors can occur
- params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
-
- // executeMethod(HttpMethod) seems to ignore the connection timeout on
- // the connection manager.
- // set it explicitly on the HttpClient.
- client.getParams().setConnectionManagerTimeout(timeout);
-
- HostConfiguration hostConf = client.getHostConfiguration();
- ArrayList<Header> headers = new ArrayList<Header>();
- // Set the User Agent in the header
- headers.add(new Header("User-Agent", userAgent));
- // prefer English
- headers.add(new Header("Accept-Language",
- "en-us,en-gb,en;q=0.7,*;q=0.3"));
- // prefer UTF-8
- headers.add(new Header("Accept-Charset",
- "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
- // prefer understandable formats
- headers.add(new Header(
- "Accept",
- "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
- // accept gzipped content
- headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
- hostConf.getParams().setParameter("http.default-headers", headers);
-
- // HTTP proxy server details
- if (useProxy) {
- hostConf.setProxy(proxyHost, proxyPort);
-
- if (proxyUsername.length() > 0) {
-
- AuthScope proxyAuthScope = getAuthScope(this.proxyHost,
- this.proxyPort, this.proxyRealm);
-
- NTCredentials proxyCredentials = new NTCredentials(
- this.proxyUsername, this.proxyPassword, Http.agentHost,
- this.proxyRealm);
-
- client.getState().setProxyCredentials(proxyAuthScope,
- proxyCredentials);
- }
- }
-
- }
-
- /**
- * Reads authentication configuration file (defined as 'http.auth.file' in
- * Nutch configuration file) and sets the credentials for the configured
- * authentication scopes in the HTTP client object.
- *
- * @throws ParserConfigurationException
- * If a document builder can not be created.
- * @throws SAXException
- * If any parsing error occurs.
- * @throws IOException
- * If any I/O error occurs.
- */
- private static synchronized void setCredentials()
- throws ParserConfigurationException, SAXException, IOException {
-
- if (authRulesRead)
- return;
-
- authRulesRead = true; // Avoid re-attempting to read
-
- InputStream is = conf.getConfResourceAsInputStream(authFile);
- if (is != null) {
- Document doc = DocumentBuilderFactory.newInstance()
- .newDocumentBuilder().parse(is);
-
- Element rootElement = doc.getDocumentElement();
- if (!"auth-configuration".equals(rootElement.getTagName())) {
- if (LOG.isWarnEnabled())
- LOG.warn("Bad auth conf file: root element <"
- + rootElement.getTagName() + "> found in "
- + authFile + " - must be <auth-configuration>");
- }
-
- // For each set of credentials
- NodeList credList = rootElement.getChildNodes();
- for (int i = 0; i < credList.getLength(); i++) {
- Node credNode = credList.item(i);
- if (!(credNode instanceof Element))
- continue;
-
- Element credElement = (Element) credNode;
- if (!"credentials".equals(credElement.getTagName())) {
- if (LOG.isWarnEnabled())
- LOG.warn("Bad auth conf file: Element <"
- + credElement.getTagName()
- + "> not recognized in " + authFile
- + " - expected <credentials>");
- continue;
- }
-
- String username = credElement.getAttribute("username");
- String password = credElement.getAttribute("password");
-
- // For each authentication scope
- NodeList scopeList = credElement.getChildNodes();
- for (int j = 0; j < scopeList.getLength(); j++) {
- Node scopeNode = scopeList.item(j);
- if (!(scopeNode instanceof Element))
- continue;
-
- Element scopeElement = (Element) scopeNode;
-
- if ("default".equals(scopeElement.getTagName())) {
-
- // Determine realm and scheme, if any
- String realm = scopeElement.getAttribute("realm");
- String scheme = scopeElement.getAttribute("scheme");
-
- // Set default credentials
- defaultUsername = username;
- defaultPassword = password;
- defaultRealm = realm;
- defaultScheme = scheme;
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("Credentials - username: " + username
- + "; set as default" + " for realm: "
- + realm + "; scheme: " + scheme);
- }
-
- } else if ("authscope".equals(scopeElement.getTagName())) {
-
- // Determine authentication scope details
- String host = scopeElement.getAttribute("host");
- int port = -1; // For setting port to AuthScope.ANY_PORT
- try {
- port = Integer.parseInt(scopeElement
- .getAttribute("port"));
- } catch (Exception ex) {
- // do nothing, port is already set to any port
- }
- String realm = scopeElement.getAttribute("realm");
- String scheme = scopeElement.getAttribute("scheme");
-
- // Set credentials for the determined scope
- AuthScope authScope = getAuthScope(host, port, realm,
- scheme);
- NTCredentials credentials = new NTCredentials(username,
- password, agentHost, realm);
-
- client.getState()
- .setCredentials(authScope, credentials);
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("Credentials - username: " + username
- + "; set for AuthScope - " + "host: "
- + host + "; port: " + port + "; realm: "
- + realm + "; scheme: " + scheme);
- }
-
- } else {
- if (LOG.isWarnEnabled())
- LOG.warn("Bad auth conf file: Element <"
- + scopeElement.getTagName()
- + "> not recognized in " + authFile
- + " - expected <authscope>");
- }
- }
- is.close();
- }
- }
- }
-
- /**
- * If credentials for the authentication scope determined from the specified
- * <code>url</code> is not already set in the HTTP client, then this method
- * sets the default credentials to fetch the specified <code>url</code>. If
- * credentials are found for the authentication scope, the method returns
- * without altering the client.
- *
- * @param url
- * URL to be fetched
- */
- private void resolveCredentials(URL url) {
-
- if (defaultUsername != null && defaultUsername.length() > 0) {
-
- int port = url.getPort();
- if (port == -1) {
- if ("https".equals(url.getProtocol()))
- port = 443;
- else
- port = 80;
- }
-
- AuthScope scope = new AuthScope(url.getHost(), port);
-
- if (client.getState().getCredentials(scope) != null) {
- if (LOG.isTraceEnabled())
- LOG.trace("Pre-configured credentials with scope - host: "
- + url.getHost() + "; port: " + port
- + "; found for url: " + url);
-
- // Credentials are already configured, so do nothing and return
- return;
- }
-
- if (LOG.isTraceEnabled())
- LOG.trace("Pre-configured credentials with scope - host: "
- + url.getHost() + "; port: " + port
- + "; not found for url: " + url);
-
- AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
- defaultRealm, defaultScheme);
-
- NTCredentials serverCredentials = new NTCredentials(
- defaultUsername, defaultPassword, agentHost, defaultRealm);
-
- client.getState()
- .setCredentials(serverAuthScope, serverCredentials);
- }
- }
-
- /**
- * Returns an authentication scope for the specified <code>host</code>,
- * <code>port</code>, <code>realm</code> and <code>scheme</code>.
- *
- * @param host
- * Host name or address.
- * @param port
- * Port number.
- * @param realm
- * Authentication realm.
- * @param scheme
- * Authentication scheme.
- */
- private static AuthScope getAuthScope(String host, int port, String realm,
- String scheme) {
-
- if (host.length() == 0)
- host = null;
-
- if (port < 0)
- port = -1;
-
- if (realm.length() == 0)
- realm = null;
-
- if (scheme.length() == 0)
- scheme = null;
-
- return new AuthScope(host, port, realm, scheme);
- }
-
- /**
- * Returns an authentication scope for the specified <code>host</code>,
- * <code>port</code> and <code>realm</code>.
- *
- * @param host
- * Host name or address.
- * @param port
- * Port number.
- * @param realm
- * Authentication realm.
- */
- private static AuthScope getAuthScope(String host, int port, String realm) {
+ // Since the Configuration has not yet been set,
+ // then an unconfigured client is returned.
+ private static HttpClient client = new HttpClient(connectionManager);
+ private static String defaultUsername;
+ private static String defaultPassword;
+ private static String defaultRealm;
+ private static String defaultScheme;
+ private static String authFile;
+ private static String agentHost;
+ private static boolean authRulesRead = false;
+ private static Configuration conf;
+
+ int maxThreadsTotal = 10;
+
+ private String proxyUsername;
+ private String proxyPassword;
+ private String proxyRealm;
+
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.MODIFIED_TIME);
+ FIELDS.add(WebPage.Field.HEADERS);
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+ /**
+ * Returns the configured HTTP client.
+ *
+ * @return HTTP client
+ */
+ static synchronized HttpClient getClient() {
+ return client;
+ }
+
+ /**
+ * Constructs this plugin.
+ */
+ public Http() {
+ super(LOG);
+ }
+
+ /**
+ * Reads the configuration from the Nutch configuration files and sets the
+ * configuration.
+ *
+ * @param conf
+ * Configuration
+ */
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ Http.conf = conf;
+ this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
+ this.proxyUsername = conf.get("http.proxy.username", "");
+ this.proxyPassword = conf.get("http.proxy.password", "");
+ this.proxyRealm = conf.get("http.proxy.realm", "");
+ agentHost = conf.get("http.agent.host", "");
+ authFile = conf.get("http.auth.file", "");
+ configureClient();
+ try {
+ setCredentials();
+ } catch (Exception ex) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error("Could not read " + authFile + " : " + ex.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Main method.
+ *
+ * @param args
+ * Command line arguments
+ */
+ public static void main(String[] args) throws Exception {
+ Http http = new Http();
+ http.setConf(NutchConfiguration.create());
+ main(http, args);
+ }
+
+ /**
+ * Fetches the <code>url</code> with a configured HTTP client and gets the
+ * response.
+ *
+ * @param url
+ * URL to be fetched
+ * @param page
+ * The WebPage record handed through to the HttpResponse
+ * @param redirect
+ * Follow redirects if and only if true
+ * @return HTTP response
+ */
+ protected Response getResponse(URL url, WebPage page, boolean redirect)
+ throws ProtocolException, IOException {
+ resolveCredentials(url);
+ return new HttpResponse(this, url, page, redirect);
+ }
+
+ /**
+ * Configures the HTTP client
+ */
+ private void configureClient() {
+
+ // Set up an HTTPS socket factory that accepts self-signed certs.
+ ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+ Protocol https = new Protocol("https", factory, 443);
+ Protocol.registerProtocol("https", https);
+
+ HttpConnectionManagerParams params = connectionManager.getParams();
+ params.setConnectionTimeout(timeout);
+ params.setSoTimeout(timeout);
+ params.setSendBufferSize(BUFFER_SIZE);
+ params.setReceiveBufferSize(BUFFER_SIZE);
+ params.setMaxTotalConnections(maxThreadsTotal);
+
+ // Also set max connections per host to maxThreadsTotal since all threads
+ // might be used to fetch from the same host - otherwise timeout errors can
+ // occur
+ params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
+
+ // executeMethod(HttpMethod) seems to ignore the connection timeout on
+ // the connection manager.
+ // set it explicitly on the HttpClient.
+ client.getParams().setConnectionManagerTimeout(timeout);
+
+ HostConfiguration hostConf = client.getHostConfiguration();
+ ArrayList<Header> headers = new ArrayList<Header>();
+ // Set the User Agent in the header
+ headers.add(new Header("User-Agent", userAgent));
+ // prefer English
+ headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
+ // prefer UTF-8
+ headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
+ // prefer understandable formats
+ headers
+ .add(new Header(
+ "Accept",
+ "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+ // accept gzipped content
+ headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
+ hostConf.getParams().setParameter("http.default-headers", headers);
+
+ // HTTP proxy server details
+ if (useProxy) {
+ hostConf.setProxy(proxyHost, proxyPort);
+
+ if (proxyUsername.length() > 0) {
+
+ AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
+ this.proxyRealm);
+
+ NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
+ this.proxyPassword, Http.agentHost, this.proxyRealm);
+
+ client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
+ }
+ }
+
+ }
+
+ /**
+  * Reads authentication configuration file (defined as 'http.auth.file' in
+  * Nutch configuration file) and sets the credentials for the configured
+  * authentication scopes in the HTTP client object.
+  * <p>
+  * The file is read at most once: the first call sets
+  * <code>authRulesRead</code> and every later call returns immediately, even
+  * if the first attempt failed. If the resource cannot be found nothing is
+  * configured. The input stream is closed as soon as the document has been
+  * parsed, whether or not parsing succeeded.
+  * <p>
+  * A <code>&lt;default&gt;</code> scope stores the enclosing credentials as
+  * the default username/password/realm/scheme; an
+  * <code>&lt;authscope&gt;</code> scope registers the credentials on the
+  * shared client for the given host/port/realm/scheme. Unrecognized elements
+  * are logged as warnings and skipped.
+  *
+  * @throws ParserConfigurationException
+  *           If a document builder can not be created.
+  * @throws SAXException
+  *           If any parsing error occurs.
+  * @throws IOException
+  *           If any I/O error occurs, including while closing the stream.
+  */
+ private static synchronized void setCredentials()
+     throws ParserConfigurationException, SAXException, IOException {
+
+   if (authRulesRead)
+     return;
+
+   authRulesRead = true; // Avoid re-attempting to read
+
+   InputStream is = conf.getConfResourceAsInputStream(authFile);
+   if (is == null)
+     return;
+
+   Document doc;
+   try {
+     doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+         .parse(is);
+   } finally {
+     // parse() has consumed the stream (or failed); either way release it
+     // here so it cannot leak when the file has no <credentials> children
+     // or when parsing throws.
+     is.close();
+   }
+
+   Element rootElement = doc.getDocumentElement();
+   if (!"auth-configuration".equals(rootElement.getTagName())) {
+     // Only a warning: the children are still processed below.
+     if (LOG.isWarnEnabled())
+       LOG.warn("Bad auth conf file: root element <"
+           + rootElement.getTagName() + "> found in " + authFile
+           + " - must be <auth-configuration>");
+   }
+
+   // For each set of credentials
+   NodeList credList = rootElement.getChildNodes();
+   for (int i = 0; i < credList.getLength(); i++) {
+     Node credNode = credList.item(i);
+     if (!(credNode instanceof Element))
+       continue;
+
+     Element credElement = (Element) credNode;
+     if (!"credentials".equals(credElement.getTagName())) {
+       if (LOG.isWarnEnabled())
+         LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
+             + "> not recognized in " + authFile
+             + " - expected <credentials>");
+       continue;
+     }
+
+     String username = credElement.getAttribute("username");
+     String password = credElement.getAttribute("password");
+
+     // For each authentication scope
+     NodeList scopeList = credElement.getChildNodes();
+     for (int j = 0; j < scopeList.getLength(); j++) {
+       Node scopeNode = scopeList.item(j);
+       if (!(scopeNode instanceof Element))
+         continue;
+
+       Element scopeElement = (Element) scopeNode;
+
+       if ("default".equals(scopeElement.getTagName())) {
+
+         // Determine realm and scheme, if any
+         String realm = scopeElement.getAttribute("realm");
+         String scheme = scopeElement.getAttribute("scheme");
+
+         // Set default credentials
+         defaultUsername = username;
+         defaultPassword = password;
+         defaultRealm = realm;
+         defaultScheme = scheme;
+
+         if (LOG.isTraceEnabled()) {
+           LOG.trace("Credentials - username: " + username
+               + "; set as default" + " for realm: " + realm + "; scheme: "
+               + scheme);
+         }
+
+       } else if ("authscope".equals(scopeElement.getTagName())) {
+
+         // Determine authentication scope details
+         String host = scopeElement.getAttribute("host");
+         int port = -1; // For setting port to AuthScope.ANY_PORT
+         try {
+           port = Integer.parseInt(scopeElement.getAttribute("port"));
+         } catch (Exception ex) {
+           // do nothing, port is already set to any port
+         }
+         String realm = scopeElement.getAttribute("realm");
+         String scheme = scopeElement.getAttribute("scheme");
+
+         // Set credentials for the determined scope
+         AuthScope authScope = getAuthScope(host, port, realm, scheme);
+         NTCredentials credentials = new NTCredentials(username, password,
+             agentHost, realm);
+
+         client.getState().setCredentials(authScope, credentials);
+
+         if (LOG.isTraceEnabled()) {
+           LOG.trace("Credentials - username: " + username
+               + "; set for AuthScope - " + "host: " + host + "; port: "
+               + port + "; realm: " + realm + "; scheme: " + scheme);
+         }
+
+       } else {
+         if (LOG.isWarnEnabled())
+           LOG.warn("Bad auth conf file: Element <"
+               + scopeElement.getTagName() + "> not recognized in "
+               + authFile + " - expected <authscope>");
+       }
+     }
+   }
+ }
+
+ /**
+ * If credentials for the authentication scope determined from the specified
+ * <code>url</code> is not already set in the HTTP client, then this method
+ * sets the default credentials to fetch the specified <code>url</code>. If
+ * credentials are found for the authentication scope, the method returns
+ * without altering the client.
+ *
+ * @param url
+ * URL to be fetched
+ */
+ private void resolveCredentials(URL url) {
+
+ if (defaultUsername != null && defaultUsername.length() > 0) {
+
+ int port = url.getPort();
+ if (port == -1) {
+ if ("https".equals(url.getProtocol()))
+ port = 443;
+ else
+ port = 80;
+ }
+
+ AuthScope scope = new AuthScope(url.getHost(), port);
+
+ if (client.getState().getCredentials(scope) != null) {
+ if (LOG.isTraceEnabled())
+ LOG.trace("Pre-configured credentials with scope - host: "
+ + url.getHost() + "; port: " + port + "; found for url: " + url);
+
+ // Credentials are already configured, so do nothing and return
+ return;
+ }
+
+ if (LOG.isTraceEnabled())
+ LOG.trace("Pre-configured credentials with scope - host: "
+ + url.getHost() + "; port: " + port + "; not found for url: " + url);
+
+ AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
+ defaultRealm, defaultScheme);
+
+ NTCredentials serverCredentials = new NTCredentials(defaultUsername,
+ defaultPassword, agentHost, defaultRealm);
+
+ client.getState().setCredentials(serverAuthScope, serverCredentials);
+ }
+ }
+
+ /**
+ * Returns an authentication scope for the specified <code>host</code>,
+ * <code>port</code>, <code>realm</code> and <code>scheme</code>.
+ *
+ * @param host
+ * Host name or address.
+ * @param port
+ * Port number.
+ * @param realm
+ * Authentication realm.
+ * @param scheme
+ * Authentication scheme.
+ */
+ private static AuthScope getAuthScope(String host, int port, String realm,
+ String scheme) {
+
+ if (host.length() == 0)
+ host = null;
+
+ if (port < 0)
+ port = -1;
+
+ if (realm.length() == 0)
+ realm = null;
+
+ if (scheme.length() == 0)
+ scheme = null;
+
+ return new AuthScope(host, port, realm, scheme);
+ }
+
+ /**
+ * Returns an authentication scope for the specified <code>host</code>,
+ * <code>port</code> and <code>realm</code>.
+ *
+ * @param host
+ * Host name or address.
+ * @param port
+ * Port number.
+ * @param realm
+ * Authentication realm.
+ */
+ private static AuthScope getAuthScope(String host, int port, String realm) {
- return getAuthScope(host, port, realm, "");
- }
+ return getAuthScope(host, port, realm, "");
+ }
}
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java Fri Jan 9 06:34:33 2015
@@ -15,32 +15,31 @@
* limitations under the License.
*/
package org.apache.nutch.protocol.httpclient;
-
+
import java.util.List;
/**
- * The base level of services required for Http Authentication
- *
+ * The base level of services required for Http Authentication
+ *
* @see HttpAuthenticationFactory
*
- * @author Matt Tencati
+ * @author Matt Tencati
*/
public interface HttpAuthentication {
- /**
- * Gets the credentials generated by the HttpAuthentication
- * object. May return null.
- *
- * @return The credentials value
- */
- public List getCredentials();
+ /**
+ * Gets the credentials generated by the HttpAuthentication object. May return
+ * null.
+ *
+ * @return The credentials value
+ */
+ public List getCredentials();
- /**
- * Gets the realm used by the HttpAuthentication object during creation.
- *
- * @return The realm value
- */
- public String getRealm();
+ /**
+ * Gets the realm used by the HttpAuthentication object during creation.
+ *
+ * @return The realm value
+ */
+ public String getRealm();
}
-
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java Fri Jan 9 06:34:33 2015
@@ -26,40 +26,46 @@ public class HttpAuthenticationException
private static final long serialVersionUID = 1L;
- /**
- * Constructs a new exception with null as its detail message.
- */
- public HttpAuthenticationException() {
- super();
- }
-
- /**
- * Constructs a new exception with the specified detail message.
- *
- * @param message the detail message. The detail message is saved for later retrieval by the {@link Throwable#getMessage()} method.
- */
- public HttpAuthenticationException(String message) {
- super(message);
- }
-
- /**
- * Constructs a new exception with the specified message and cause.
- *
- * @param message the detail message. The detail message is saved for later retrieval by the {@link Throwable#getMessage()} method.
- * @param cause the cause (use {@link #getCause()} to retrieve the cause)
- */
- public HttpAuthenticationException(String message, Throwable cause) {
- super(message, cause);
- }
-
- /**
- * Constructs a new exception with the specified cause and detail message from
- * given clause if it is not null.
- *
- * @param cause the cause (use {@link #getCause()} to retrieve the cause)
- */
- public HttpAuthenticationException(Throwable cause) {
- super(cause);
- }
+ /**
+ * Constructs a new exception with null as its detail message.
+ */
+ public HttpAuthenticationException() {
+ super();
+ }
+
+ /**
+ * Constructs a new exception with the specified detail message.
+ *
+ * @param message
+ * the detail message. The detail message is saved for later
+ * retrieval by the {@link Throwable#getMessage()} method.
+ */
+ public HttpAuthenticationException(String message) {
+ super(message);
+ }
+
+ /**
+ * Constructs a new exception with the specified message and cause.
+ *
+ * @param message
+ * the detail message. The detail message is saved for later
+ * retrieval by the {@link Throwable#getMessage()} method.
+ * @param cause
+ * the cause (use {@link #getCause()} to retrieve the cause)
+ */
+ public HttpAuthenticationException(String message, Throwable cause) {
+ super(message, cause);
+ }
+
+ /**
+ * Constructs a new exception with the specified cause and detail message from
+ * given cause if it is not null.
+ *
+ * @param cause
+ * the cause (use {@link #getCause()} to retrieve the cause)
+ */
+ public HttpAuthenticationException(Throwable cause) {
+ super(cause);
+ }
}
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Fri Jan 9 06:34:33 2015
@@ -34,12 +34,10 @@ import org.apache.hadoop.conf.Configurab
// Nutch imports
import org.apache.nutch.metadata.Metadata;
-
/**
- * Provides the Http protocol implementation
- * with the ability to authenticate when prompted. The goal is to provide
- * multiple authentication types but for now just the {@link HttpBasicAuthentication} authentication
- * type is provided.
+ * Provides the Http protocol implementation with the ability to authenticate
+ * when prompted. The goal is to provide multiple authentication types but for
+ * now just the {@link HttpBasicAuthentication} authentication type is provided.
*
* @see HttpBasicAuthentication
* @see Http
@@ -49,94 +47,96 @@ import org.apache.nutch.metadata.Metadat
*/
public class HttpAuthenticationFactory implements Configurable {
- /**
- * The HTTP Authentication (WWW-Authenticate) header which is returned
- * by a webserver requiring authentication.
- */
- public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
-
- public static final Logger LOG = LoggerFactory.getLogger(HttpAuthenticationFactory.class);
-
- private static Map<?, ?> auths = new TreeMap<Object, Object>();
-
- private Configuration conf = null;
-
-
- public HttpAuthenticationFactory(Configuration conf) {
- setConf(conf);
- }
-
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- //if (conf.getBoolean("http.auth.verbose", false)) {
- // LOG.setLevel(Level.FINE);
- //} else {
- // LOG.setLevel(Level.WARNING);
- //}
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
-
- @SuppressWarnings("unchecked")
- public HttpAuthentication findAuthentication(Metadata header) {
-
- if (header == null) return null;
-
- try {
- Collection challenge = null;
- if (header instanceof Metadata) {
- Object o = header.get(WWW_AUTHENTICATE);
- if (o instanceof Collection) {
- challenge = (Collection<?>) o;
- } else {
- challenge = new ArrayList<String>();
- challenge.add(o.toString());
- }
- } else {
- String challengeString = header.get(WWW_AUTHENTICATE);
- if (challengeString != null) {
- challenge = new ArrayList<Object>();
- challenge.add(challengeString);
- }
- }
- if (challenge == null) {
- if (LOG.isTraceEnabled()) {
- LOG.trace("Authentication challenge is null");
- }
- return null;
- }
-
- Iterator<?> i = challenge.iterator();
- HttpAuthentication auth = null;
- while (i.hasNext() && auth == null) {
- String challengeString = (String)i.next();
- if (challengeString.equals("NTLM")) {
- challengeString="Basic realm=techweb";
- }
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("Checking challengeString=" + challengeString);
- }
- auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
- if (auth != null) return auth;
-
- //TODO Add additional Authentication lookups here
- }
- } catch (Exception e) {
- LOG.error("Failed with following exception: ", e);
- }
+ /**
+ * The HTTP Authentication (WWW-Authenticate) header which is returned by a
+ * webserver requiring authentication.
+ */
+ public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HttpAuthenticationFactory.class);
+
+ private static Map<?, ?> auths = new TreeMap<Object, Object>();
+
+ private Configuration conf = null;
+
+ public HttpAuthenticationFactory(Configuration conf) {
+ setConf(conf);
+ }
+
+ /*
+ * ---------------------------------- * <implementation:Configurable> *
+ * ----------------------------------
+ */
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ // if (conf.getBoolean("http.auth.verbose", false)) {
+ // LOG.setLevel(Level.FINE);
+ // } else {
+ // LOG.setLevel(Level.WARNING);
+ // }
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /*
+ * ---------------------------------- * </implementation:Configurable> *
+ * ----------------------------------
+ */
+
+ /**
+  * Looks for a <code>WWW-Authenticate</code> challenge in the given response
+  * headers and tries to build an authentication object for it.
+  * <p>
+  * Each challenge string is offered to
+  * {@link HttpBasicAuthentication#getAuthentication}; the first non-null
+  * result is returned. A challenge equal to "NTLM" is replaced by
+  * "Basic realm=techweb" before the lookup.
+  *
+  * @param header
+  *          response headers; may be null
+  * @return the authentication found, or null when <code>header</code> is
+  *         null, carries no challenge, no authentication matches, or an
+  *         exception occurred (the exception is logged, not rethrown)
+  */
+ @SuppressWarnings("unchecked")
+ public HttpAuthentication findAuthentication(Metadata header) {
+
+   if (header == null)
+     return null;
+
+   try {
+     Collection challenge = null;
+     if (header instanceof Metadata) {
+       Object o = header.get(WWW_AUTHENTICATE);
+       if (o instanceof Collection) {
+         challenge = (Collection<?>) o;
+       } else if (o != null) {
+         // Guard against a missing header: without this check o.toString()
+         // threw a NullPointerException that was only caught and logged as
+         // an error below. Leaving challenge null takes the trace path.
+         challenge = new ArrayList<String>();
+         challenge.add(o.toString());
+       }
+     } else {
+       String challengeString = header.get(WWW_AUTHENTICATE);
+       if (challengeString != null) {
+         challenge = new ArrayList<Object>();
+         challenge.add(challengeString);
+       }
+     }
+     if (challenge == null) {
+       if (LOG.isTraceEnabled()) {
+         LOG.trace("Authentication challenge is null");
+       }
      return null;
+     }
+
+     Iterator<?> i = challenge.iterator();
+     HttpAuthentication auth = null;
+     while (i.hasNext() && auth == null) {
+       String challengeString = (String) i.next();
+       if (challengeString.equals("NTLM")) {
+         challengeString = "Basic realm=techweb";
+       }
+
+       if (LOG.isTraceEnabled()) {
+         LOG.trace("Checking challengeString=" + challengeString);
+       }
+       auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
+       if (auth != null)
+         return auth;
+
+       // TODO Add additional Authentication lookups here
+     }
+   } catch (Exception e) {
+     LOG.error("Failed with following exception: ", e);
    }
+   return null;
+ }
}
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Fri Jan 9 06:34:33 2015
@@ -35,154 +35,163 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
-
/**
- * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are stored
- * in standard Nutch configuration files using the following properties:
- * http.auth.basic.<realm>.user
- * http.auth.basic.<realm>.pass
+ * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are
+ * stored in standard Nutch configuration files using the following properties:
+ * <ul>
+ * <li><code>http.auth.basic.&lt;realm&gt;.user</code></li>
+ * <li><code>http.auth.basic.&lt;realm&gt;.password</code></li>
+ * </ul>
*/
-public class HttpBasicAuthentication implements HttpAuthentication, Configurable {
+public class HttpBasicAuthentication implements HttpAuthentication,
+ Configurable {
- public static final Logger LOG = LoggerFactory.getLogger(HttpBasicAuthentication.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HttpBasicAuthentication.class);
- private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
-
- private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>();
-
- private Configuration conf = null;
- private String challenge = null;
- private ArrayList<String> credentials = null;
- private String realm = null;
-
-
- /**
- * Construct an HttpBasicAuthentication for the given challenge
- * parameters. The challenge parameters are returned by the web
- * server using a WWW-Authenticate header. This will typically be
- * represented by single line of the form <code>WWW-Authenticate: Basic realm="myrealm"</code>
- *
- * @param challenge WWW-Authenticate header from web server
- */
- protected HttpBasicAuthentication(String challenge, Configuration conf) throws HttpAuthenticationException {
-
- setConf(conf);
- this.challenge = challenge;
- credentials = new ArrayList<String>();
-
- String username = this.conf.get("http.auth.basic." + challenge + ".user");
- String password = this.conf.get("http.auth.basic." + challenge + ".password");
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("BasicAuthentication challenge is " + challenge);
- LOG.trace("BasicAuthentication username=" + username);
- LOG.trace("BasicAuthentication password=" + password);
- }
-
- if (username == null) {
- throw new HttpAuthenticationException("Username for " + challenge + " is null");
- }
+ private static Pattern basic = Pattern
+ .compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
- if (password == null) {
- throw new HttpAuthenticationException("Password for " + challenge + " is null");
+ private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>();
+
+ private Configuration conf = null;
+ private String challenge = null;
+ private ArrayList<String> credentials = null;
+ private String realm = null;
+
+  /**
+   * Construct an HttpBasicAuthentication for the given realm. The web server
+   * announces the realm in a WWW-Authenticate header, typically a single line
+   * of the form <code>WWW-Authenticate: Basic realm="myrealm"</code>. The user
+   * name and password are read from the configuration properties
+   * <code>http.auth.basic.&lt;realm&gt;.user</code> and
+   * <code>http.auth.basic.&lt;realm&gt;.password</code>, and the resulting
+   * <code>Authorization</code> header line is stored in the credentials list.
+   *
+   * @param challenge
+   *          Name of the realm; it forms the middle part of the configuration
+   *          keys listed above
+   * @param conf
+   *          Configuration holding the user name and password
+   * @throws HttpAuthenticationException
+   *           if no user name or no password is configured for the realm
+   */
+  protected HttpBasicAuthentication(String challenge, Configuration conf)
+      throws HttpAuthenticationException {
+
+    setConf(conf);
+    this.challenge = challenge;
+    // getAuthentication passes the realm name here; record it so that
+    // getRealm() has something to return.
+    this.realm = challenge;
+    credentials = new ArrayList<String>();
+
+    String username = this.conf.get("http.auth.basic." + challenge + ".user");
+    String password = this.conf.get("http.auth.basic." + challenge
+        + ".password");
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("BasicAuthentication challenge is " + challenge);
+      LOG.trace("BasicAuthentication username=" + username);
+      // Only report whether a password exists; never write it to the log.
+      LOG.trace("BasicAuthentication password is "
+          + (password == null ? "not set" : "set"));
+    }
+
+    if (username == null) {
+      throw new HttpAuthenticationException("Username for " + challenge
+          + " is null");
+    }
+
+    if (password == null) {
+      throw new HttpAuthenticationException("Password for " + challenge
+          + " is null");
+    }
+
+    // NOTE(review): getBytes() and new String(byte[]) use the platform
+    // default charset; consider an explicit charset if non-ASCII credentials
+    // must be supported.
+    byte[] credBytes = (username + ":" + password).getBytes();
+    credentials.add("Authorization: Basic "
+        + new String(Base64.encodeBase64(credBytes)));
+    if (LOG.isTraceEnabled()) {
+      // The Base64 form is trivially reversible, so it is not logged either.
+      LOG.trace("Basic credentials generated for " + challenge);
+    }
+  }
+
+ /*
+ * ---------------------------------- * <implementation:Configurable> *
+ * ----------------------------------
+ */
+
+  /**
+   * Stores the configuration used by this object.
+   *
+   * @param conf
+   *          Configuration to keep; handed back by {@link #getConf()}
+   */
+  public void setConf(Configuration conf) {
+    // A switch on "http.auth.verbose" that would adjust the log level was
+    // sketched here at some point but is disabled; the configuration is only
+    // stored.
+    this.conf = conf;
+  }
+
+  /**
+   * Returns the configuration previously stored via
+   * {@link #setConf(Configuration)}.
+   *
+   * @return the stored Configuration, possibly <code>null</code>
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+ /*
+ * ---------------------------------- * </implementation:Configurable> *
+ * ----------------------------------
+ */
+
+  /**
+   * Returns the Basic credentials held by this HttpBasicAuthentication object.
+   *
+   * @return the list of credential header lines, each of the form
+   *         <code>Authorization: Basic &lt;Base64 encoded
+   *         userid:password&gt;</code>
+   */
+  public List<String> getCredentials() {
+    return this.credentials;
+  }
+
+  /**
+   * Returns the realm attribute of this HttpBasicAuthentication object.
+   *
+   * @return the value of the realm field; <code>null</code> if it was never
+   *         assigned
+   */
+  public String getRealm() {
+    return this.realm;
+  }
+
+ /**
+ * This method is responsible for providing Basic authentication information.
+ * The method caches authentication information for each realm so that the
+ * required authentication information does not need to be regenerated for
+ * every request.
+ *
+ * @param challenge
+ * The challenge string provided by the webserver. This is the text
+ * which follows the WWW-Authenticate header, including the Basic
+ * tag.
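+ * @param conf
+ * The configuration from which the user name and password for the
+ * realm are read.
+ * @return An HttpBasicAuthentication object, or null if the challenge is
+ * not a Basic challenge or appropriate credentials cannot be
+ * generated.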
+ */
+ public static HttpBasicAuthentication getAuthentication(String challenge,
+ Configuration conf) {
+ if (challenge == null)
+ return null;
+ Matcher basicMatcher = basic.matcher(challenge);
+ if (basicMatcher.matches()) {
+ String realm = basicMatcher.group(1);
+ Object auth = authMap.get(realm);
+ if (auth == null) {
+ HttpBasicAuthentication newAuth = null;
+ try {
+ newAuth = new HttpBasicAuthentication(realm, conf);
+ } catch (HttpAuthenticationException hae) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("HttpBasicAuthentication failed for " + challenge);
+ }
}
-
- byte[] credBytes = (username + ":" + password).getBytes();
- credentials.add("Authorization: Basic " + new String(Base64.encodeBase64(credBytes)));
- if (LOG.isTraceEnabled()) {
- LOG.trace("Basic credentials: " + credentials);
- }
- }
-
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- //if (conf.getBoolean("http.auth.verbose", false)) {
- // LOG.setLevel(Level.FINE);
- //} else {
- // LOG.setLevel(Level.WARNING);
- //}
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- /* ---------------------------------- *
- * <implementation:Configurable> *
- * ---------------------------------- */
-
-
- /**
- * Gets the Basic credentials generated by this
- * HttpBasicAuthentication object
- *
- * @return Credentials in the form of <code>Authorization: Basic <Base64 encoded userid:password>
- *
- */
- public List<String> getCredentials() {
- return credentials;
- }
-
-
- /**
- * Gets the realm attribute of the HttpBasicAuthentication object.
- * This should have been supplied to the {@link #getAuthentication(String, Configuration)}
- * static method
- *
- * @return The realm
- */
- public String getRealm() {
- return realm;
- }
-
- /**
- * This method is responsible for providing Basic authentication information. The
- * method caches authentication information for each realm so that the required
- * authentication information does not need to be regenerated for every request.
- *
- * @param challenge The challenge string provided by the webserver. This is the
- * text which follows the WWW-Authenticate header, including the Basic tag.
- * @return An HttpBasicAuthentication object or null
- * if unable to generate appropriate credentials.
- */
- public static HttpBasicAuthentication getAuthentication(String challenge, Configuration conf) {
- if (challenge == null) return null;
- Matcher basicMatcher = basic.matcher(challenge);
- if (basicMatcher.matches()) {
- String realm = basicMatcher.group(1);
- Object auth = authMap.get(realm);
- if (auth == null) {
- HttpBasicAuthentication newAuth = null;
- try {
- newAuth = new HttpBasicAuthentication(realm, conf);
- } catch (HttpAuthenticationException hae) {
- if (LOG.isTraceEnabled()) {
- LOG.trace("HttpBasicAuthentication failed for " + challenge);
- }
- }
- authMap.put(realm, newAuth);
- return newAuth;
- } else {
- return (HttpBasicAuthentication) auth;
- }
- }
- return null;
- }
-
- /**
- * Provides a pattern which can be used by an outside resource to determine if
- * this class can provide credentials based on simple header information. It does
- * not calculate any information regarding realms or challenges.
- *
- * @return Returns a Pattern which will match a Basic WWW-Authenticate header.
- */
- public static final Pattern getBasicPattern() {
- return basic;
- }
+ authMap.put(realm, newAuth);
+ return newAuth;
+ } else {
+ return (HttpBasicAuthentication) auth;
+ }
+ }
+ return null;
+ }
+
+  /**
+   * Exposes the pattern this class uses to recognise a Basic WWW-Authenticate
+   * challenge, so that outside code can check from simple header information
+   * whether this class can provide credentials. No realm or challenge
+   * information is computed here.
+   *
+   * @return the Pattern matching a Basic WWW-Authenticate header
+   */
+  public static final Pattern getBasicPattern() {
+    return HttpBasicAuthentication.basic;
+  }
}
-
Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Jan 9 06:34:33 2015
@@ -41,7 +41,7 @@ import org.apache.nutch.storage.WebPage;
/**
* An HTTP response.
- *
+ *
* @author Susam Pal
*/
public class HttpResponse implements Response {
@@ -53,18 +53,22 @@ public class HttpResponse implements Res
/**
* Fetches the given <code>url</code> and prepares HTTP response.
- *
- * @param http An instance of the implementation class
- * of this plugin
- * @param url URL to be fetched
- * @param page WebPage
- * @param followRedirects Whether to follow redirects; follows
- * redirect if and only if this is true
- * @return HTTP response
- * @throws IOException When an error occurs
+ *
+ * @param http
+ * An instance of the implementation class of this plugin
+ * @param url
+ * URL to be fetched
+ * @param page
+ * WebPage
+ * @param followRedirects
+ * Whether to follow redirects; follows redirect if and only if this
+ * is true
+ * @return HTTP response
+ * @throws IOException
+ * When an error occurs
*/
- HttpResponse(Http http, URL url, WebPage page,
- boolean followRedirects) throws IOException {
+ HttpResponse(Http http, URL url, WebPage page, boolean followRedirects)
+ throws IOException {
// Prepare GET method for HTTP request
this.url = url;
@@ -99,7 +103,7 @@ public class HttpResponse implements Res
for (int i = 0; i < heads.length; i++) {
headers.set(heads[i].getName(), heads[i].getValue());
}
-
+
// Limit download size
int contentLength = Integer.MAX_VALUE;
String contentLengthString = headers.get(Response.CONTENT_LENGTH);
@@ -107,12 +111,10 @@ public class HttpResponse implements Res
try {
contentLength = Integer.parseInt(contentLengthString.trim());
} catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " +
- contentLengthString);
+ throw new HttpException("bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 &&
- contentLength > http.getMaxContent()) {
+ if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
contentLength = http.getMaxContent();
}
@@ -132,7 +134,8 @@ public class HttpResponse implements Res
content = out.toByteArray();
} catch (Exception e) {
- if (code == 200) throw new IOException(e.toString());
+ if (code == 200)
+ throw new IOException(e.toString());
// for codes other than 200 OK, we are fine with empty content
} finally {
if (in != null) {
@@ -140,16 +143,15 @@ public class HttpResponse implements Res
}
get.abort();
}
-
+
StringBuilder fetchTrace = null;
if (Http.LOG.isTraceEnabled()) {
// Trace message
- fetchTrace = new StringBuilder("url: " + url +
- "; status code: " + code +
- "; bytes received: " + content.length);
+ fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
+ + "; bytes received: " + content.length);
if (getHeader(Response.CONTENT_LENGTH) != null)
- fetchTrace.append("; Content-Length: " +
- getHeader(Response.CONTENT_LENGTH));
+ fetchTrace.append("; Content-Length: "
+ + getHeader(Response.CONTENT_LENGTH));
if (getHeader(Response.LOCATION) != null)
fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
}
@@ -159,8 +161,7 @@ public class HttpResponse implements Res
String contentEncoding = headers.get(Response.CONTENT_ENCODING);
if (contentEncoding != null && Http.LOG.isTraceEnabled())
fetchTrace.append("; Content-Encoding: " + contentEncoding);
- if ("gzip".equals(contentEncoding) ||
- "x-gzip".equals(contentEncoding)) {
+ if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
if (Http.LOG.isTraceEnabled())
fetchTrace.append("; extracted to " + content.length + " bytes");
@@ -170,14 +171,14 @@ public class HttpResponse implements Res
fetchTrace.append("; extracted to " + content.length + " bytes");
}
}
-
+
// add headers in metadata to row
- if (page.getHeaders() != null) {
- page.getHeaders().clear();
- }
- for (String key : headers.names()) {
- page.getHeaders().put(new Utf8(key), new Utf8(headers.get(key)));
- }
+ if (page.getHeaders() != null) {
+ page.getHeaders().clear();
+ }
+ for (String key : headers.names()) {
+ page.getHeaders().put(new Utf8(key), new Utf8(headers.get(key)));
+ }
// Logger trace message
if (Http.LOG.isTraceEnabled()) {
@@ -188,15 +189,15 @@ public class HttpResponse implements Res
}
}
-
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
-
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
+
public URL getUrl() {
return url;
}
-
+
public int getCode() {
return code;
}
@@ -204,7 +205,7 @@ public class HttpResponse implements Res
public String getHeader(String name) {
return headers.get(name);
}
-
+
public Metadata getHeaders() {
return headers;
}
@@ -213,8 +214,8 @@ public class HttpResponse implements Res
return content;
}
- /* -------------------------- *
- * </implementation:Response> *
- * -------------------------- */
+ /*
+ * -------------------------- * </implementation:Response> *
+ * --------------------------
+ */
}
-