You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/07/24 06:29:28 UTC
[1/3] nutch git commit: fix the cookie policy issue when the form
authentication receives session cookie in a non-standard format - NUTCH-2280
Repository: nutch
Updated Branches:
refs/heads/master fda3e148b -> 9f32fe84a
fix the cookie policy issue when the form authentication receives session cookie in a non-standard format - NUTCH-2280
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/993e997e
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/993e997e
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/993e997e
Branch: refs/heads/master
Commit: 993e997e2d5795c0623cdf5614d02c7a8ce405d5
Parents: 5943d11
Author: Steve Yao <st...@live.com>
Authored: Tue Jul 12 19:41:10 2016 +0800
Committer: Steve Yao <st...@live.com>
Committed: Tue Jul 12 19:41:10 2016 +0800
----------------------------------------------------------------------
.../apache/nutch/protocol/httpclient/Http.java | 79 ++++++++++++--------
.../httpclient/HttpFormAuthConfigurer.java | 21 +++++-
.../httpclient/HttpFormAuthentication.java | 28 +++++++
3 files changed, 95 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index 75506ce..9b91180 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -69,16 +69,16 @@ import org.apache.nutch.util.NutchConfiguration;
* session.
* </p>
* <p>
- * Documentation can be found on the Nutch <a
- * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
- * >HttpAuthenticationSchemes</a> wiki page.
+ * Documentation can be found on the Nutch
+ * <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes" >
+ * HttpAuthenticationSchemes</a> wiki page.
* </p>
* <p>
- * The original description of the motivation to support <a
- * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
- * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
- * HttpPostAuthentication development is documented at the <a
- * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
+ * The original description of the motivation to support
+ * <a href="https://wiki.apache.org/nutch/HttpPostAuthentication" >
+ * HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
+ * HttpPostAuthentication development is documented at the
+ * <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
* issue.
*
* @author Susam Pal
@@ -146,6 +146,7 @@ public class Http extends HttpBase {
setCredentials();
} catch (Exception ex) {
if (LOG.isErrorEnabled()) {
+ LOG.error("Http ", ex);
LOG.error("Could not read " + authFile + " : " + ex.getMessage());
}
}
@@ -202,15 +203,15 @@ public class Http extends HttpBase {
// NUTCH-1836: Modification to increase the number of available connections
// for multi-threaded crawls.
// --------------------------------------------------------------------------------
- params.setMaxTotalConnections(conf.getInt(
- "mapred.tasktracker.map.tasks.maximum", 5)
- * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
+ params.setMaxTotalConnections(
+ conf.getInt("mapred.tasktracker.map.tasks.maximum", 5)
+ * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
// Also set max connections per host to maxThreadsTotal since all threads
// might be used to fetch from the same host - otherwise timeout errors can
// occur
- params.setDefaultMaxConnectionsPerHost(conf.getInt(
- "fetcher.threads.fetch", maxThreadsTotal));
+ params.setDefaultMaxConnectionsPerHost(
+ conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
// executeMethod(HttpMethod) seems to ignore the connection timeout on the
// connection manager.
@@ -226,10 +227,8 @@ public class Http extends HttpBase {
// prefer UTF-8
headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
// prefer understandable formats
- headers
- .add(new Header(
- "Accept",
- "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+ headers.add(new Header("Accept",
+ "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
// accept gzipped content
headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
hostConf.getParams().setParameter("http.default-headers", headers);
@@ -266,7 +265,6 @@ public class Http extends HttpBase {
*/
private static synchronized void setCredentials()
throws ParserConfigurationException, SAXException, IOException {
-
if (authRulesRead)
return;
@@ -333,9 +331,9 @@ public class Http extends HttpBase {
defaultScheme = scheme;
if (LOG.isTraceEnabled()) {
- LOG.trace("Credentials - username: " + username
- + "; set as default" + " for realm: " + realm + "; scheme: "
- + scheme);
+ LOG.trace(
+ "Credentials - username: " + username + "; set as default"
+ + " for realm: " + realm + "; scheme: " + scheme);
}
} else if ("authscope".equals(scopeElement.getTagName())) {
@@ -378,11 +376,15 @@ public class Http extends HttpBase {
/**
* <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
- * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field
- * name="username" value="user1"/> </loginPostData> <additionalPostHeaders>
- * <field name="header1" value="vaule1"/> </additionalPostHeaders>
- * <removedFormFields> <field name="header1"/> </removedFormFields>
- * </credentials> </auth-configuration>
+ * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field name
+ * ="username" value="user1"/> </loginPostData>
+ * <additionalPostHeaders> <field name="header1" value="vaule1"/>
+ * </additionalPostHeaders>
+ * <removedFormFields> <field name="header1"/> </removedFormFields> <!--
+ * NUTCH-2280: Add the <loginCookie> node and its <policy> sub-node to the
+ * <credentials> node. The <policy> value selects the cookie policy used for
+ * the login form POST. It should be the name of a CookiePolicy constant.
+ * --> </credentials> </auth-configuration>
*/
private static HttpFormAuthConfigurer readFormAuthConfigurer(
Element credElement, String authMethod) {
@@ -407,6 +409,7 @@ public class Http extends HttpBase {
}
NodeList nodeList = credElement.getChildNodes();
+
for (int j = 0; j < nodeList.getLength(); j++) {
Node node = nodeList.item(j);
if (!(node instanceof Element))
@@ -454,13 +457,28 @@ public class Http extends HttpBase {
removedFormFields.add(name);
}
formConfigurer.setRemovedFormFields(removedFormFields);
+ } else if ("loginCookie".equals(element.getTagName())) {
+ // NUTCH-2280
+ LOG.debug("start loginCookie");
+ NodeList childNodes = element.getChildNodes();
+ for (int k = 0; k < childNodes.getLength(); k++) {
+ Node fieldNode = childNodes.item(k);
+ if (!(fieldNode instanceof Element))
+ continue;
+ Element fieldElement = (Element) fieldNode;
+ if ("policy".equals(fieldElement.getTagName())) {
+ String policy = fieldElement.getTextContent();
+ formConfigurer.setCookiePolicy(policy);
+ LOG.debug("cookie policy is " + policy);
+ }
+ }
}
}
return formConfigurer;
} else {
- throw new IllegalArgumentException("Unsupported authMethod: "
- + authMethod);
+ throw new IllegalArgumentException(
+ "Unsupported authMethod: " + authMethod);
}
}
@@ -510,8 +528,9 @@ public class Http extends HttpBase {
}
if (LOG.isTraceEnabled())
- LOG.trace("Pre-configured credentials with scope - host: "
- + url.getHost() + "; port: " + port + "; not found for url: " + url);
+ LOG.trace(
+ "Pre-configured credentials with scope - host: " + url.getHost()
+ + "; port: " + port + "; not found for url: " + url);
AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
defaultRealm, defaultScheme);
http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
index b713ab6..f9cff36 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
@@ -33,8 +33,8 @@ public class HttpFormAuthConfigurer {
*/
private Map<String, String> additionalPostHeaders;
/**
- * If http post login returns redirect code: 301 or 302,
- * Http Client will automatically follow the redirect.
+ * If http post login returns redirect code: 301 or 302, Http Client will
+ * automatically follow the redirect.
*/
private boolean loginRedirect;
/**
@@ -42,6 +42,12 @@ public class HttpFormAuthConfigurer {
*/
private Set<String> removedFormFields;
+ /**
+ * Cookie policy name used to set the HttpClient cookie policy. The value
+ * should be one of: DEFAULT, BROWSER_COMPATIBILITY, NETSCAPE, RFC_2109.
+ */
+ private String cookiePolicy;
+
public HttpFormAuthConfigurer() {
}
@@ -102,5 +108,14 @@ public class HttpFormAuthConfigurer {
public HttpFormAuthConfigurer setRemovedFormFields(
Set<String> removedFormFields) {
this.removedFormFields = removedFormFields;
- return this; }
+ return this;
+ }
+
+ public void setCookiePolicy(String policy) {
+ this.cookiePolicy = policy;
+ }
+
+ public String getCookiePolicy() {
+ return this.cookiePolicy;
+ }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
index 4c73f50..a6d4aa4 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -31,9 +31,12 @@ import java.util.Set;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.reflect.FieldUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -115,6 +118,11 @@ public class HttpFormAuthentication {
// Entity enclosing requests cannot be redirected without user
// intervention
setLoginHeader(post);
+
+ // NUTCH-2280
+ LOGGER.debug("FormAuth: set cookie policy");
+ this.setCookieParams(authConfigurer, post.getParams());
+
post.addParameters(params.toArray(new NameValuePair[0]));
int rspCode = client.executeMethod(post);
if (LOGGER.isDebugEnabled()) {
@@ -135,6 +143,26 @@ public class HttpFormAuthentication {
}
}
}
+
+ /**
+ * @throws NoSuchFieldException
+ * @throws SecurityException
+ * @throws IllegalArgumentException
+ * @throws IllegalAccessException
+ */
+ private void setCookieParams(HttpFormAuthConfigurer formConfigurer,
+ HttpMethodParams params)
+ throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
+ // NUTCH-2280 - set the HttpClient cookie policy
+ if (formConfigurer.getCookiePolicy() != null) {
+ String policy = formConfigurer.getCookiePolicy();
+ Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
+ if(null != p) {
+ LOGGER.debug("reflection of cookie value: " + p.toString());
+ params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
+ }
+ }
+ }
private void setLoginHeader(PostMethod post) {
Map<String, String> headers = new HashMap<String, String>();
[2/3] nutch git commit: Format the HttpFormAuthentication.java with
eclipse format and add javadoc. Add the httpclient-auth.xml.template for
cookie policy config example.
Posted by le...@apache.org.
Format the HttpFormAuthentication.java with eclipse format and add javadoc. Add the httpclient-auth.xml.template for cookie policy config example.
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/753cad0b
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/753cad0b
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/753cad0b
Branch: refs/heads/master
Commit: 753cad0bd66ab525eb618d7f0e947eec034e207d
Parents: 993e997
Author: Steve Yao <st...@live.com>
Authored: Wed Jul 13 12:21:26 2016 +0800
Committer: Steve Yao <st...@live.com>
Committed: Wed Jul 13 12:21:26 2016 +0800
----------------------------------------------------------------------
conf/httpclient-auth.xml.template | 6 ++
.../httpclient/HttpFormAuthentication.java | 62 +++++++++++---------
2 files changed, 40 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/753cad0b/conf/httpclient-auth.xml.template
----------------------------------------------------------------------
diff --git a/conf/httpclient-auth.xml.template b/conf/httpclient-auth.xml.template
index ce5ed7e..9d23093 100644
--- a/conf/httpclient-auth.xml.template
+++ b/conf/httpclient-auth.xml.template
@@ -82,6 +82,9 @@
<removedFormFields>
<field name="ctl00$MainContent$LoginUser$RememberMe"/>
</removedFormFields>
+ <loginCookie>
+ <policy>BROWSER_COMPATIBILITY</policy>
+ </loginCookie>
</credentials>
it is critical that the following fields are substituted:
@@ -98,6 +101,9 @@
the field and password respectively
* <field name="ctl00$MainContent$LoginUser$RememberMe"/>
- form element attributes for which we wish to skip fields
+ * The <policy> value inside <loginCookie> is the name of a constant defined in
+ org.apache.commons.httpclient.cookie.CookiePolicy, such as BROWSER_COMPATIBILITY,
+ DEFAULT or RFC_2109.
More information on HTTP POST can be located at
https://wiki.apache.org/nutch/HttpPostAuthentication
http://git-wip-us.apache.org/repos/asf/nutch/blob/753cad0b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
index a6d4aa4..2f36538 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -51,8 +51,7 @@ public class HttpFormAuthentication {
static {
defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
- defaultLoginHeaders
- .put("Accept",
+ defaultLoginHeaders.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
defaultLoginHeaders.put("Connection", "keep-alive");
@@ -79,15 +78,12 @@ public class HttpFormAuthentication {
Set<String> removedFormFields) {
this.authConfigurer.setLoginUrl(loginUrl);
this.authConfigurer.setLoginFormId(loginForm);
- this.authConfigurer
- .setLoginPostData(loginPostData == null ? new HashMap<String, String>()
- : loginPostData);
- this.authConfigurer
- .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>()
- : additionalPostHeaders);
- this.authConfigurer
- .setRemovedFormFields(removedFormFields == null ? new HashSet<String>()
- : removedFormFields);
+ this.authConfigurer.setLoginPostData(
+ loginPostData == null ? new HashMap<String, String>() : loginPostData);
+ this.authConfigurer.setAdditionalPostHeaders(additionalPostHeaders == null
+ ? new HashMap<String, String>() : additionalPostHeaders);
+ this.authConfigurer.setRemovedFormFields(
+ removedFormFields == null ? new HashSet<String>() : removedFormFields);
this.client = new HttpClient();
}
@@ -118,11 +114,11 @@ public class HttpFormAuthentication {
// Entity enclosing requests cannot be redirected without user
// intervention
setLoginHeader(post);
-
+
// NUTCH-2280
LOGGER.debug("FormAuth: set cookie policy");
this.setCookieParams(authConfigurer, post.getParams());
-
+
post.addParameters(params.toArray(new NameValuePair[0]));
int rspCode = client.executeMethod(post);
if (LOGGER.isDebugEnabled()) {
@@ -143,25 +139,34 @@ public class HttpFormAuthentication {
}
}
}
-
+
/**
+ * NUTCH-2280 Set the cookie policy value from httpclient-auth.xml for the
+ * Post httpClient action.
+ *
+ * @param formConfigurer
+ * - the httpclient-auth.xml values
+ *
+ * @param params
+ * - the HttpMethodParams from the current httpclient instance
+ *
* @throws NoSuchFieldException
* @throws SecurityException
* @throws IllegalArgumentException
* @throws IllegalAccessException
*/
private void setCookieParams(HttpFormAuthConfigurer formConfigurer,
- HttpMethodParams params)
- throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
- // NUTCH-2280 - set the HttpClient cookie policy
- if (formConfigurer.getCookiePolicy() != null) {
- String policy = formConfigurer.getCookiePolicy();
- Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
- if(null != p) {
- LOGGER.debug("reflection of cookie value: " + p.toString());
- params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
- }
- }
+ HttpMethodParams params) throws NoSuchFieldException, SecurityException,
+ IllegalArgumentException, IllegalAccessException {
+ // NUTCH-2280 - set the HttpClient cookie policy
+ if (formConfigurer.getCookiePolicy() != null) {
+ String policy = formConfigurer.getCookiePolicy();
+ Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
+ if (null != p) {
+ LOGGER.debug("reflection of cookie value: " + p.toString());
+ params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
+ }
+ }
}
private void setLoginHeader(PostMethod post) {
@@ -204,12 +209,13 @@ public class HttpFormAuthentication {
if (loginform == null) {
LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
authConfigurer.getLoginFormId());
- loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first();
+ loginform = doc
+ .select("form[name=" + authConfigurer.getLoginFormId() + "]").first();
if (loginform == null) {
LOGGER.debug("No form element found with 'name' = {}",
authConfigurer.getLoginFormId());
- throw new IllegalArgumentException("No form exists: "
- + authConfigurer.getLoginFormId());
+ throw new IllegalArgumentException(
+ "No form exists: " + authConfigurer.getLoginFormId());
}
}
Elements inputElements = loginform.getElementsByTag("input");
[3/3] nutch git commit: Merge branch 'NUTCH-2280' of
https://github.com/stevegy/nutch this closes #134
Posted by le...@apache.org.
Merge branch 'NUTCH-2280' of https://github.com/stevegy/nutch this closes #134
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9f32fe84
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9f32fe84
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9f32fe84
Branch: refs/heads/master
Commit: 9f32fe84a0a2ec1fc3761cb6b0c277584b0ed484
Parents: fda3e14 753cad0
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Sat Jul 23 23:13:33 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Sat Jul 23 23:13:33 2016 -0700
----------------------------------------------------------------------
conf/httpclient-auth.xml.template | 6 ++
.../apache/nutch/protocol/httpclient/Http.java | 79 ++++++++++++--------
.../httpclient/HttpFormAuthConfigurer.java | 21 +++++-
.../httpclient/HttpFormAuthentication.java | 62 +++++++++++----
4 files changed, 121 insertions(+), 47 deletions(-)
----------------------------------------------------------------------