You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/16 15:32:35 UTC
svn commit: r1595193 - in /nutch/trunk: CHANGES.txt
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
src/plugin/protocol-http/plugin.xml
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Author: jnioche
Date: Fri May 16 13:32:35 2014
New Revision: 1595193
URL: http://svn.apache.org/r1595193
Log:
NUTCH-1676 Add rudimentary SSL support to protocol-http
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-http/plugin.xml
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 16 13:32:35 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
+
* NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche)
* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri May 16 13:32:35 2014
@@ -19,6 +19,9 @@ package org.apache.nutch.protocol.http.a
// JDK imports
import java.io.IOException;
import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
// Logging imports
import org.slf4j.Logger;
@@ -43,9 +46,6 @@ import org.apache.hadoop.io.Text;
// crawler-commons imports
import crawlercommons.robots.BaseRobotRules;
-/**
- * @author Jérôme Charron
- */
public abstract class HttpBase implements Protocol {
public static final Text RESPONSE_TIME = new Text("_rs_");
@@ -103,6 +103,12 @@ public abstract class HttpBase implement
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
+
+ /** Which TLS/SSL protocols to support */
+ protected Set<String> tlsPreferredProtocols;
+
+ /** Which TLS/SSL cipher suites to support */
+ protected Set<String> tlsPreferredCipherSuites;
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -133,6 +139,32 @@ public abstract class HttpBase implement
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.robots.setConf(conf);
+
+ String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+ String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
+ "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+ "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
+ "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
+ "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
+ "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
+ "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
+ "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
+
+ tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+ tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+
logConf();
}
@@ -258,6 +290,14 @@ public abstract class HttpBase implement
return useHttp11;
}
+ public Set<String> getTlsPreferredCipherSuites() { // accessor for the cipher suites read from "http.tls.supported.cipher.suites"
+ return tlsPreferredCipherSuites; // returns the field itself, not a copy
+ }
+
+ public Set<String> getTlsPreferredProtocols() { // accessor for the protocol names read from "http.tls.supported.protocols"
+ return tlsPreferredProtocols; // returns the field itself, not a copy
+ }
+
private static String getAgentString(String agentName,
String agentVersion,
String agentDesc,
Modified: nutch/trunk/src/plugin/protocol-http/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/plugin.xml?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/plugin.xml (original)
+++ nutch/trunk/src/plugin/protocol-http/plugin.xml Fri May 16 13:32:35 2014
@@ -40,6 +40,11 @@
class="org.apache.nutch.protocol.http.Http">
<parameter name="protocolName" value="http"/>
</implementation>
+
+ <implementation id="org.apache.nutch.protocol.http.Http"
+ class="org.apache.nutch.protocol.http.Http">
+ <parameter name="protocolName" value="https"/>
+ </implementation>
</extension>
Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Fri May 16 13:32:35 2014
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.protocol.http;
-// JDK imports
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
@@ -28,6 +27,13 @@ import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
@@ -50,6 +56,11 @@ public class HttpResponse implements Res
private byte[] content;
private int code;
private Metadata headers = new SpellCheckedMetadata();
+
+ protected enum Scheme { // URL schemes the constructor accepts; any other protocol raises HttpException
+ HTTP, // chosen when url.getProtocol() is "http"; default port 80 if the URL names none
+ HTTPS, // chosen when url.getProtocol() is "https"; default port 443, and the socket is wrapped in an SSLSocket
+ }
/**
* Default public constructor.
@@ -66,9 +77,16 @@ public class HttpResponse implements Res
this.url = url;
this.orig = url.toString();
this.base = url.toString();
-
- if (!"http".equals(url.getProtocol()))
- throw new HttpException("Not an HTTP url:" + url);
+
+ Scheme scheme = null;
+
+ if ("http".equals(url.getProtocol())) {
+ scheme = Scheme.HTTP;
+ } else if ("https".equals(url.getProtocol())) {
+ scheme = Scheme.HTTPS;
+ } else {
+ throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+ }
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetching " + url);
@@ -84,7 +102,11 @@ public class HttpResponse implements Res
int port;
String portString;
if (url.getPort() == -1) {
- port= 80;
+ if (scheme == Scheme.HTTP) {
+ port = 80;
+ } else {
+ port = 443;
+ }
portString= "";
} else {
port= url.getPort();
@@ -102,6 +124,26 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
+
+ if (scheme == Scheme.HTTPS) {
+ SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault();
+ SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true);
+ sslsocket.setUseClientMode(true);
+
+ // Get the protocols and ciphers supported by this JVM
+ Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols()));
+ Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+ // Intersect with preferred protocols and ciphers
+ protocols.retainAll(http.getTlsPreferredProtocols());
+ ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+ sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()]));
+ sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()]));
+
+ sslsocket.startHandshake();
+ socket = sslsocket;
+ }
this.conf = http.getConf();
if (sockAddr != null