You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/16 15:32:35 UTC

svn commit: r1595193 - in /nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java src/plugin/protocol-http/plugin.xml src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Author: jnioche
Date: Fri May 16 13:32:35 2014
New Revision: 1595193

URL: http://svn.apache.org/r1595193
Log:
NUTCH-1676 Add rudimentary SSL support to protocol-http

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/trunk/src/plugin/protocol-http/plugin.xml
    nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 16 13:32:35 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
+
 * NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche)
 
 * NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri May 16 13:32:35 2014
@@ -19,6 +19,9 @@ package org.apache.nutch.protocol.http.a
 // JDK imports
 import java.io.IOException;
 import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
 
 // Logging imports
 import org.slf4j.Logger;
@@ -43,9 +46,6 @@ import org.apache.hadoop.io.Text;
 // crawler-commons imports
 import crawlercommons.robots.BaseRobotRules;
 
-/**
- * @author Jérôme Charron
- */
 public abstract class HttpBase implements Protocol {
   
   public static final Text RESPONSE_TIME = new Text("_rs_");
@@ -103,6 +103,12 @@ public abstract class HttpBase implement
   
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
+  
+  /** Preferred TLS/SSL protocols, read from http.tls.supported.protocols */
+  protected Set<String> tlsPreferredProtocols;
+  
+  /** Preferred TLS/SSL cipher suites, read from http.tls.supported.cipher.suites */
+  protected Set<String> tlsPreferredCipherSuites;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -133,6 +139,32 @@ public abstract class HttpBase implement
       this.useHttp11 = conf.getBoolean("http.useHttp11", false);
       this.responseTime = conf.getBoolean("http.store.responsetime", true);
       this.robots.setConf(conf);
+      
+      String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+      String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", 
+          "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+          "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+          "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+          "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+          "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+          "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
+          "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+          "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
+          "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+          "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
+          "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+          "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
+          "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+          "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
+          "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
+          "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
+          "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
+          "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
+          "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
+
+      tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+      tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+
       logConf();
   }
 
@@ -258,6 +290,14 @@ public abstract class HttpBase implement
     return useHttp11;
   }
   
+  public Set<String> getTlsPreferredCipherSuites() {
+    return tlsPreferredCipherSuites;
+  }
+  
+  public Set<String> getTlsPreferredProtocols() {
+    return tlsPreferredProtocols;
+  }
+
   private static String getAgentString(String agentName,
                                        String agentVersion,
                                        String agentDesc,

Modified: nutch/trunk/src/plugin/protocol-http/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/plugin.xml?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/plugin.xml (original)
+++ nutch/trunk/src/plugin/protocol-http/plugin.xml Fri May 16 13:32:35 2014
@@ -40,6 +40,11 @@
                       class="org.apache.nutch.protocol.http.Http">
         <parameter name="protocolName" value="http"/>
       </implementation>
+      
+      <implementation id="org.apache.nutch.protocol.http.Http"
+                       class="org.apache.nutch.protocol.http.Http">
+           <parameter name="protocolName" value="https"/>
+      </implementation>
 
    </extension>
 

Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1595193&r1=1595192&r2=1595193&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Fri May 16 13:32:35 2014
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.protocol.http;
 
-// JDK imports
 import java.io.BufferedInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.EOFException;
@@ -28,6 +27,13 @@ import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+ 
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.nutch.crawl.CrawlDatum;
@@ -50,6 +56,11 @@ public class HttpResponse implements Res
   private byte[] content;
   private int code;
   private Metadata headers = new SpellCheckedMetadata();
+  
+  protected enum Scheme {
+    HTTP,
+    HTTPS,
+  }
 
   /**
    * Default public constructor.
@@ -66,9 +77,16 @@ public class HttpResponse implements Res
     this.url = url;
     this.orig = url.toString();
     this.base = url.toString();
-
-    if (!"http".equals(url.getProtocol()))
-      throw new HttpException("Not an HTTP url:" + url);
+        
+    Scheme scheme = null;
+ 
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
 
     if (Http.LOG.isTraceEnabled()) {
       Http.LOG.trace("fetching " + url);
@@ -84,7 +102,11 @@ public class HttpResponse implements Res
     int port;
     String portString;
     if (url.getPort() == -1) {
-      port= 80;
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
       portString= "";
     } else {
       port= url.getPort();
@@ -102,6 +124,26 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
+     
+      if (scheme == Scheme.HTTPS) {
+        SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault();
+        SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+        
+        // Get the protocols and ciphers supported by this JVM    
+        Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites()));
+        
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+        
+        sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()]));
+        
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
       
       this.conf = http.getConf();
       if (sockAddr != null