You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/02/21 11:16:38 UTC
nutch git commit: NUTCH-2355 Protocol plugins to set cookie if Cookie metadata field is present
Repository: nutch
Updated Branches:
refs/heads/master 9a9c4b32b -> 217fad16b
NUTCH-2355 Protocol plugins to set cookie if Cookie metadata field is present
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/217fad16
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/217fad16
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/217fad16
Branch: refs/heads/master
Commit: 217fad16bfdea0494390e8f170d9350cf06657ef
Parents: 9a9c4b3
Author: Markus Jelsma <ma...@apache.org>
Authored: Tue Feb 21 11:55:33 2017 +0100
Committer: Markus Jelsma <ma...@apache.org>
Committed: Tue Feb 21 11:55:33 2017 +0100
----------------------------------------------------------------------
conf/nutch-default.xml | 8 ++++++++
.../org/apache/nutch/protocol/http/api/HttpBase.java | 15 +++++++++++++--
.../org/apache/nutch/protocol/http/HttpResponse.java | 11 +++++++++--
.../nutch/protocol/httpclient/HttpResponse.java | 7 +++++++
4 files changed, 37 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ea7df89..08fb8a0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -344,6 +344,14 @@
</description>
</property>
+<property>
+ <name>http.enable.cookie.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+ is read from the CrawlDatum Cookie metadata field.
+ </description>
+</property>
+
<!-- FTP properties -->
<property>
http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 41b63e3..eb3eb60 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -59,6 +59,8 @@ public abstract class HttpBase implements Protocol {
public static final Text RESPONSE_TIME = new Text("_rs_");
+ public static final Text COOKIE = new Text("Cookie");
+
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -124,7 +126,10 @@ public abstract class HttpBase implements Protocol {
protected Set<String> tlsPreferredCipherSuites;
/** Configuration directive for If-Modified-Since HTTP header */
- public boolean enableIfModifiedsinceHeader = true;
+ protected boolean enableIfModifiedsinceHeader = true;
+
+ /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+ protected boolean enableCookieHeader = true;
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -157,6 +162,7 @@ public abstract class HttpBase implements Protocol {
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+ this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
this.robots.setConf(conf);
// NUTCH-1941: read list of alternating agent names
@@ -369,6 +375,10 @@ public abstract class HttpBase implements Protocol {
public boolean isIfModifiedSinceEnabled() {
return enableIfModifiedsinceHeader;
}
+
+ public boolean isCookieEnabled() {
+ return enableCookieHeader;
+ }
public int getMaxContent() {
return maxContent;
@@ -458,6 +468,7 @@ public abstract class HttpBase implements Protocol {
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
+ logger.info("http.enable.cookie.header = " + isCookieEnabled());
}
}
@@ -584,4 +595,4 @@ public abstract class HttpBase implements Protocol {
}
return hm;
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index f6d7e4d..d984dc4 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -34,6 +34,7 @@ import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -58,7 +59,7 @@ public class HttpResponse implements Response {
private Metadata headers = new SpellCheckedMetadata();
// used for storing the http headers verbatim
private StringBuffer httpHeaders;
-
+
protected enum Scheme {
HTTP, HTTPS,
}
@@ -195,6 +196,13 @@ public class HttpResponse implements Response {
reqStr.append("Accept: ");
reqStr.append(this.http.getAccept());
reqStr.append("\r\n");
+
+ if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+ String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+ reqStr.append("Cookie: ");
+ reqStr.append(cookie);
+ reqStr.append("\r\n");
+ }
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
reqStr.append("If-Modified-Since: " + HttpDateFormat
@@ -554,5 +562,4 @@ public class HttpResponse implements Response {
in.unread(value);
return value;
}
-
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index f074af2..6041e13 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -39,6 +39,7 @@ import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.io.Text;
/**
* An HTTP response.
@@ -96,6 +97,12 @@ public class HttpResponse implements Response {
// XXX the request body was sent the method is not retried, so there is
// XXX little danger in retrying...
// params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+
+ if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+ String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+ get.addRequestHeader("Cookie", cookie);
+ }
+
try {
HttpClient client = Http.getClient();
client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941