You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/02/21 11:16:38 UTC

nutch git commit: NUTCH-2355 Protocol plugins to set cookie if Cookie metadata field is present

Repository: nutch
Updated Branches:
  refs/heads/master 9a9c4b32b -> 217fad16b


NUTCH-2355 Protocol plugins to set cookie if Cookie metadata field is present


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/217fad16
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/217fad16
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/217fad16

Branch: refs/heads/master
Commit: 217fad16bfdea0494390e8f170d9350cf06657ef
Parents: 9a9c4b3
Author: Markus Jelsma <ma...@apache.org>
Authored: Tue Feb 21 11:55:33 2017 +0100
Committer: Markus Jelsma <ma...@apache.org>
Committed: Tue Feb 21 11:55:33 2017 +0100

----------------------------------------------------------------------
 conf/nutch-default.xml                               |  8 ++++++++
 .../org/apache/nutch/protocol/http/api/HttpBase.java | 15 +++++++++++++--
 .../org/apache/nutch/protocol/http/HttpResponse.java | 11 +++++++++--
 .../nutch/protocol/httpclient/HttpResponse.java      |  7 +++++++
 4 files changed, 37 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ea7df89..08fb8a0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -344,6 +344,14 @@
   </description>
 </property>
 
+<property>
+  <name>http.enable.cookie.header</name>
+  <value>true</value>
+  <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+  is read from the CrawlDatum Cookie metadata field.
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 41b63e3..eb3eb60 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -59,6 +59,8 @@ public abstract class HttpBase implements Protocol {
 
   public static final Text RESPONSE_TIME = new Text("_rs_");
 
+  public static final Text COOKIE = new Text("Cookie");
+  
   public static final int BUFFER_SIZE = 8 * 1024;
 
   private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -124,7 +126,10 @@ public abstract class HttpBase implements Protocol {
   protected Set<String> tlsPreferredCipherSuites;
   
   /** Configuration directive for If-Modified-Since HTTP header */
-  public boolean enableIfModifiedsinceHeader = true;
+  protected boolean enableIfModifiedsinceHeader = true;
+  
+  /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+  protected boolean enableCookieHeader = true;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -157,6 +162,7 @@ public abstract class HttpBase implements Protocol {
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
     this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+    this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
     this.robots.setConf(conf);
 
     // NUTCH-1941: read list of alternating agent names
@@ -369,6 +375,10 @@ public abstract class HttpBase implements Protocol {
   public boolean isIfModifiedSinceEnabled() {
     return enableIfModifiedsinceHeader;
   }
+  
+  public boolean isCookieEnabled() {
+    return enableCookieHeader;
+  }
 
   public int getMaxContent() {
     return maxContent;
@@ -458,6 +468,7 @@ public abstract class HttpBase implements Protocol {
       logger.info("http.agent = " + userAgent);
       logger.info("http.accept.language = " + acceptLanguage);
       logger.info("http.accept = " + accept);
+      logger.info("http.enable.cookie.header = " + isCookieEnabled());
     }
   }
 
@@ -584,4 +595,4 @@ public abstract class HttpBase implements Protocol {
     }
     return hm;
   }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index f6d7e4d..d984dc4 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -34,6 +34,7 @@ import javax.net.ssl.SSLSocket;
 import javax.net.ssl.SSLSocketFactory;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -58,7 +59,7 @@ public class HttpResponse implements Response {
   private Metadata headers = new SpellCheckedMetadata();
   // used for storing the http headers verbatim
   private StringBuffer httpHeaders;
-
+  
   protected enum Scheme {
     HTTP, HTTPS,
   }
@@ -195,6 +196,13 @@ public class HttpResponse implements Response {
       reqStr.append("Accept: ");
       reqStr.append(this.http.getAccept());
       reqStr.append("\r\n");
+      
+      if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+        String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+        reqStr.append("Cookie: ");
+        reqStr.append(cookie);
+        reqStr.append("\r\n");
+      }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
         reqStr.append("If-Modified-Since: " + HttpDateFormat
@@ -554,5 +562,4 @@ public class HttpResponse implements Response {
     in.unread(value);
     return value;
   }
-
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index f074af2..6041e13 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -39,6 +39,7 @@ import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.io.Text;
 
 /**
  * An HTTP response.
@@ -96,6 +97,12 @@ public class HttpResponse implements Response {
     // XXX the request body was sent the method is not retried, so there is
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+    
+    if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+      String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+      get.addRequestHeader("Cookie", cookie);
+    }
+    
     try {
       HttpClient client = Http.getClient();
       client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941