You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/07/24 06:29:28 UTC

[1/3] nutch git commit: fix the cookie policy issue when the form authentication receives session cookie in a non-standard format - NUTCH-2280

Repository: nutch
Updated Branches:
  refs/heads/master fda3e148b -> 9f32fe84a


fix the cookie policy issue when the form authentication receives session cookie in a non-standard format - NUTCH-2280


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/993e997e
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/993e997e
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/993e997e

Branch: refs/heads/master
Commit: 993e997e2d5795c0623cdf5614d02c7a8ce405d5
Parents: 5943d11
Author: Steve Yao <st...@live.com>
Authored: Tue Jul 12 19:41:10 2016 +0800
Committer: Steve Yao <st...@live.com>
Committed: Tue Jul 12 19:41:10 2016 +0800

----------------------------------------------------------------------
 .../apache/nutch/protocol/httpclient/Http.java  | 79 ++++++++++++--------
 .../httpclient/HttpFormAuthConfigurer.java      | 21 +++++-
 .../httpclient/HttpFormAuthentication.java      | 28 +++++++
 3 files changed, 95 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index 75506ce..9b91180 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -69,16 +69,16 @@ import org.apache.nutch.util.NutchConfiguration;
  * session.
  * </p>
  * <p>
- * Documentation can be found on the Nutch <a
- * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
- * >HttpAuthenticationSchemes</a> wiki page.
+ * Documentation can be found on the Nutch
+ * <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes" >
+ * HttpAuthenticationSchemes</a> wiki page.
  * </p>
  * <p>
- * The original description of the motivation to support <a
- * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
- * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
- * HttpPostAuthentication development is documented at the <a
- * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
+ * The original description of the motivation to support
+ * <a href="https://wiki.apache.org/nutch/HttpPostAuthentication" >
+ * HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
+ * HttpPostAuthentication development is documented at the
+ * <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
  * issue.
  * 
  * @author Susam Pal
@@ -146,6 +146,7 @@ public class Http extends HttpBase {
       setCredentials();
     } catch (Exception ex) {
       if (LOG.isErrorEnabled()) {
+        LOG.error("Http ", ex);
         LOG.error("Could not read " + authFile + " : " + ex.getMessage());
       }
     }
@@ -202,15 +203,15 @@ public class Http extends HttpBase {
     // NUTCH-1836: Modification to increase the number of available connections
     // for multi-threaded crawls.
     // --------------------------------------------------------------------------------
-    params.setMaxTotalConnections(conf.getInt(
-        "mapred.tasktracker.map.tasks.maximum", 5)
-        * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
+    params.setMaxTotalConnections(
+        conf.getInt("mapred.tasktracker.map.tasks.maximum", 5)
+            * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
 
     // Also set max connections per host to maxThreadsTotal since all threads
     // might be used to fetch from the same host - otherwise timeout errors can
     // occur
-    params.setDefaultMaxConnectionsPerHost(conf.getInt(
-        "fetcher.threads.fetch", maxThreadsTotal));
+    params.setDefaultMaxConnectionsPerHost(
+        conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
 
     // executeMethod(HttpMethod) seems to ignore the connection timeout on the
     // connection manager.
@@ -226,10 +227,8 @@ public class Http extends HttpBase {
     // prefer UTF-8
     headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
     // prefer understandable formats
-    headers
-        .add(new Header(
-            "Accept",
-            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    headers.add(new Header("Accept",
+        "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
     // accept gzipped content
     headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
     hostConf.getParams().setParameter("http.default-headers", headers);
@@ -266,7 +265,6 @@ public class Http extends HttpBase {
    */
   private static synchronized void setCredentials()
       throws ParserConfigurationException, SAXException, IOException {
-
     if (authRulesRead)
       return;
 
@@ -333,9 +331,9 @@ public class Http extends HttpBase {
             defaultScheme = scheme;
 
             if (LOG.isTraceEnabled()) {
-              LOG.trace("Credentials - username: " + username
-                  + "; set as default" + " for realm: " + realm + "; scheme: "
-                  + scheme);
+              LOG.trace(
+                  "Credentials - username: " + username + "; set as default"
+                      + " for realm: " + realm + "; scheme: " + scheme);
             }
 
           } else if ("authscope".equals(scopeElement.getTagName())) {
@@ -378,11 +376,15 @@ public class Http extends HttpBase {
 
   /**
    * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
-   * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field
-   * name="username" value="user1"/> </loginPostData> <additionalPostHeaders>
-   * <field name="header1" value="vaule1"/> </additionalPostHeaders>
-   * <removedFormFields> <field name="header1"/> </removedFormFields>
-   * </credentials> </auth-configuration>
+   * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field name
+   * ="username" value="user1"/> </loginPostData>
+   * <additionalPostHeaders> <field name="header1" value="vaule1"/>
+   * </additionalPostHeaders>
+   * <removedFormFields> <field name="header1"/> </removedFormFields> <!--
+   * NUTCH-2280: Add <loginCookie> and its sub-node <policy> nodes into the
+   * <credentials> node. The <policy> will mark the POST login form cookie
+   * policy. The value could be CookiePolicy.<ConstantValues>.
+   * --> </credentials> </auth-configuration>
    */
   private static HttpFormAuthConfigurer readFormAuthConfigurer(
       Element credElement, String authMethod) {
@@ -407,6 +409,7 @@ public class Http extends HttpBase {
       }
 
       NodeList nodeList = credElement.getChildNodes();
+
       for (int j = 0; j < nodeList.getLength(); j++) {
         Node node = nodeList.item(j);
         if (!(node instanceof Element))
@@ -454,13 +457,28 @@ public class Http extends HttpBase {
             removedFormFields.add(name);
           }
           formConfigurer.setRemovedFormFields(removedFormFields);
+        } else if ("loginCookie".equals(element.getTagName())) {
+          // NUTCH-2280
+          LOG.debug("start loginCookie");
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+            Element fieldElement = (Element) fieldNode;
+            if ("policy".equals(fieldElement.getTagName())) {
+              String policy = fieldElement.getTextContent();
+              formConfigurer.setCookiePolicy(policy);
+              LOG.debug("cookie policy is " + policy);
+            }
+          }
         }
       }
 
       return formConfigurer;
     } else {
-      throw new IllegalArgumentException("Unsupported authMethod: "
-          + authMethod);
+      throw new IllegalArgumentException(
+          "Unsupported authMethod: " + authMethod);
     }
   }
 
@@ -510,8 +528,9 @@ public class Http extends HttpBase {
       }
 
       if (LOG.isTraceEnabled())
-        LOG.trace("Pre-configured credentials with scope -  host: "
-            + url.getHost() + "; port: " + port + "; not found for url: " + url);
+        LOG.trace(
+            "Pre-configured credentials with scope -  host: " + url.getHost()
+                + "; port: " + port + "; not found for url: " + url);
 
       AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
           defaultRealm, defaultScheme);

http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
index b713ab6..f9cff36 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
@@ -33,8 +33,8 @@ public class HttpFormAuthConfigurer {
    */
   private Map<String, String> additionalPostHeaders;
   /**
-   * If http post login returns redirect code: 301 or 302, 
-   * Http Client will automatically follow the redirect.
+   * If http post login returns redirect code: 301 or 302, Http Client will
+   * automatically follow the redirect.
    */
   private boolean loginRedirect;
   /**
@@ -42,6 +42,12 @@ public class HttpFormAuthConfigurer {
    */
   private Set<String> removedFormFields;
 
+  /**
+   * Use this cookie policy to set the HttpClient cookie policy. This value
+   * should be one of DEFAULT, BROWSER_COMPATIBILITY, NETSCAPE or RFC_2109.
+   */
+  private String cookiePolicy;
+
   public HttpFormAuthConfigurer() {
   }
 
@@ -102,5 +108,14 @@ public class HttpFormAuthConfigurer {
   public HttpFormAuthConfigurer setRemovedFormFields(
       Set<String> removedFormFields) {
     this.removedFormFields = removedFormFields;
-    return this; }
+    return this;
+  }
+
+  public void setCookiePolicy(String policy) {
+    this.cookiePolicy = policy;
+  }
+
+  public String getCookiePolicy() {
+    return this.cookiePolicy;
+  }
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
index 4c73f50..a6d4aa4 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -31,9 +31,12 @@ import java.util.Set;
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.reflect.FieldUtils;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -115,6 +118,11 @@ public class HttpFormAuthentication {
       // Entity enclosing requests cannot be redirected without user
       // intervention
       setLoginHeader(post);
+      
+      // NUTCH-2280
+      LOGGER.debug("FormAuth: set cookie policy");
+      this.setCookieParams(authConfigurer, post.getParams());
+            
       post.addParameters(params.toArray(new NameValuePair[0]));
       int rspCode = client.executeMethod(post);
       if (LOGGER.isDebugEnabled()) {
@@ -135,6 +143,26 @@ public class HttpFormAuthentication {
       }
     }
   }
+  
+  /**
+   * @throws NoSuchFieldException
+   * @throws SecurityException
+   * @throws IllegalArgumentException
+   * @throws IllegalAccessException
+   */
+  private void setCookieParams(HttpFormAuthConfigurer formConfigurer,
+		  HttpMethodParams params)
+  		throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
+  	// NUTCH-2280 - set the HttpClient cookie policy
+        if (formConfigurer.getCookiePolicy() != null) {
+      	  String policy = formConfigurer.getCookiePolicy();
+      	  Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
+      	  if(null != p) {
+      		  LOGGER.debug("reflection of cookie value: " + p.toString());
+      		  params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
+      	  }
+        }
+  }
 
   private void setLoginHeader(PostMethod post) {
     Map<String, String> headers = new HashMap<String, String>();


[2/3] nutch git commit: Format the HttpFormAuthentication.java with eclipse format and add javadoc. Add the httpclient-auth.xml.template for cookie policy config example.

Posted by le...@apache.org.
Format the HttpFormAuthentication.java with eclipse format and add javadoc. Add the httpclient-auth.xml.template for cookie policy config example.


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/753cad0b
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/753cad0b
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/753cad0b

Branch: refs/heads/master
Commit: 753cad0bd66ab525eb618d7f0e947eec034e207d
Parents: 993e997
Author: Steve Yao <st...@live.com>
Authored: Wed Jul 13 12:21:26 2016 +0800
Committer: Steve Yao <st...@live.com>
Committed: Wed Jul 13 12:21:26 2016 +0800

----------------------------------------------------------------------
 conf/httpclient-auth.xml.template               |  6 ++
 .../httpclient/HttpFormAuthentication.java      | 62 +++++++++++---------
 2 files changed, 40 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/753cad0b/conf/httpclient-auth.xml.template
----------------------------------------------------------------------
diff --git a/conf/httpclient-auth.xml.template b/conf/httpclient-auth.xml.template
index ce5ed7e..9d23093 100644
--- a/conf/httpclient-auth.xml.template
+++ b/conf/httpclient-auth.xml.template
@@ -82,6 +82,9 @@
      <removedFormFields>
        <field name="ctl00$MainContent$LoginUser$RememberMe"/>
      </removedFormFields>
+     <loginCookie>
+       <policy>BROWSER_COMPATIBILITY</policy>
+     </loginCookie>
    </credentials>
  
  it is critical that the following fields are substituted:
@@ -98,6 +101,9 @@
     the field and password respectively
   * <field name="ctl00$MainContent$LoginUser$RememberMe"/>
     - form element attributes for which we wish to skip fields
+  * the <policy> value inside <loginCookie> is the name of a constant defined in
+    org.apache.commons.httpclient.cookie.CookiePolicy, such as
+    BROWSER_COMPATIBILITY, DEFAULT or RFC_2109.
  
  More information on HTTP POST can be located at
  https://wiki.apache.org/nutch/HttpPostAuthentication

http://git-wip-us.apache.org/repos/asf/nutch/blob/753cad0b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
index a6d4aa4..2f36538 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -51,8 +51,7 @@ public class HttpFormAuthentication {
 
   static {
     defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
-    defaultLoginHeaders
-    .put("Accept",
+    defaultLoginHeaders.put("Accept",
         "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
     defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
     defaultLoginHeaders.put("Connection", "keep-alive");
@@ -79,15 +78,12 @@ public class HttpFormAuthentication {
       Set<String> removedFormFields) {
     this.authConfigurer.setLoginUrl(loginUrl);
     this.authConfigurer.setLoginFormId(loginForm);
-    this.authConfigurer
-    .setLoginPostData(loginPostData == null ? new HashMap<String, String>()
-        : loginPostData);
-    this.authConfigurer
-    .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>()
-        : additionalPostHeaders);
-    this.authConfigurer
-    .setRemovedFormFields(removedFormFields == null ? new HashSet<String>()
-        : removedFormFields);
+    this.authConfigurer.setLoginPostData(
+        loginPostData == null ? new HashMap<String, String>() : loginPostData);
+    this.authConfigurer.setAdditionalPostHeaders(additionalPostHeaders == null
+        ? new HashMap<String, String>() : additionalPostHeaders);
+    this.authConfigurer.setRemovedFormFields(
+        removedFormFields == null ? new HashSet<String>() : removedFormFields);
     this.client = new HttpClient();
   }
 
@@ -118,11 +114,11 @@ public class HttpFormAuthentication {
       // Entity enclosing requests cannot be redirected without user
       // intervention
       setLoginHeader(post);
-      
+
       // NUTCH-2280
       LOGGER.debug("FormAuth: set cookie policy");
       this.setCookieParams(authConfigurer, post.getParams());
-            
+
       post.addParameters(params.toArray(new NameValuePair[0]));
       int rspCode = client.executeMethod(post);
       if (LOGGER.isDebugEnabled()) {
@@ -143,25 +139,34 @@ public class HttpFormAuthentication {
       }
     }
   }
-  
+
   /**
+   * NUTCH-2280 Set the cookie policy value from httpclient-auth.xml for the
+   * Post httpClient action.
+   * 
+   * @param formConfigurer
+   *          - the httpclient-auth.xml values
+   * 
+   * @param params
+   *          - the HttpMethodParams from the current httpclient instance
+   * 
    * @throws NoSuchFieldException
    * @throws SecurityException
    * @throws IllegalArgumentException
    * @throws IllegalAccessException
    */
   private void setCookieParams(HttpFormAuthConfigurer formConfigurer,
-		  HttpMethodParams params)
-  		throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
-  	// NUTCH-2280 - set the HttpClient cookie policy
-        if (formConfigurer.getCookiePolicy() != null) {
-      	  String policy = formConfigurer.getCookiePolicy();
-      	  Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
-      	  if(null != p) {
-      		  LOGGER.debug("reflection of cookie value: " + p.toString());
-      		  params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
-      	  }
-        }
+      HttpMethodParams params) throws NoSuchFieldException, SecurityException,
+      IllegalArgumentException, IllegalAccessException {
+    // NUTCH-2280 - set the HttpClient cookie policy
+    if (formConfigurer.getCookiePolicy() != null) {
+      String policy = formConfigurer.getCookiePolicy();
+      Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy);
+      if (null != p) {
+        LOGGER.debug("reflection of cookie value: " + p.toString());
+        params.setParameter(HttpMethodParams.COOKIE_POLICY, p);
+      }
+    }
   }
 
   private void setLoginHeader(PostMethod post) {
@@ -204,12 +209,13 @@ public class HttpFormAuthentication {
     if (loginform == null) {
       LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
           authConfigurer.getLoginFormId());
-      loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first();
+      loginform = doc
+          .select("form[name=" + authConfigurer.getLoginFormId() + "]").first();
       if (loginform == null) {
         LOGGER.debug("No form element found with 'name' = {}",
             authConfigurer.getLoginFormId());
-        throw new IllegalArgumentException("No form exists: "
-            + authConfigurer.getLoginFormId());
+        throw new IllegalArgumentException(
+            "No form exists: " + authConfigurer.getLoginFormId());
       }
     }
     Elements inputElements = loginform.getElementsByTag("input");


[3/3] nutch git commit: Merge branch 'NUTCH-2280' of https://github.com/stevegy/nutch this closes #134

Posted by le...@apache.org.
Merge branch 'NUTCH-2280' of https://github.com/stevegy/nutch this closes #134


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9f32fe84
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9f32fe84
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9f32fe84

Branch: refs/heads/master
Commit: 9f32fe84a0a2ec1fc3761cb6b0c277584b0ed484
Parents: fda3e14 753cad0
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Sat Jul 23 23:13:33 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Sat Jul 23 23:13:33 2016 -0700

----------------------------------------------------------------------
 conf/httpclient-auth.xml.template               |  6 ++
 .../apache/nutch/protocol/httpclient/Http.java  | 79 ++++++++++++--------
 .../httpclient/HttpFormAuthConfigurer.java      | 21 +++++-
 .../httpclient/HttpFormAuthentication.java      | 62 +++++++++++----
 4 files changed, 121 insertions(+), 47 deletions(-)
----------------------------------------------------------------------