You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/13 23:05:48 UTC

svn commit: r1659697 - in /nutch/trunk: CHANGES.txt conf/httpclient-auth.xml.template src/plugin/protocol-httpclient/ivy.xml src/plugin/protocol-httpclient/plugin.xml src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Author: lewismc
Date: Fri Feb 13 22:05:47 2015
New Revision: 1659697

URL: http://svn.apache.org/r1659697
Log:
NUTCH-827 HTTP POST Authentication

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/httpclient-auth.xml.template
    nutch/trunk/src/plugin/protocol-httpclient/ivy.xml
    nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Feb 13 22:05:47 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)
+
 * NUTCH-1724 LinkDBReader to support regex output filtering (markus)
 
 * NUTCH-1939 Fetcher fails to follow redirects (Leo Ye via snagel)

Modified: nutch/trunk/conf/httpclient-auth.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/httpclient-auth.xml.template?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/conf/httpclient-auth.xml.template (original)
+++ nutch/trunk/conf/httpclient-auth.xml.template Fri Feb 13 22:05:47 2015
@@ -54,6 +54,54 @@
 
   NTLM does not use the notion of realms. The domain name may be
   specified as the value for 'realm' attribute in case of NTLM.
+
+ More information on Basic, Digest and NTLM authentication
+ support can be located at https://wiki.apache.org/nutch/HttpAuthenticationSchemes
+
+ HTTP-POST Authentication Support
+ HTTP form-based authentication is a very commonly used authentication
+ mechanism to protect web resources. We extend the 'auth-configuration' 
+ to include information about http form authentication properties as shown
+ in the following example:
+
+ Example:-
+   <credentials authMethod="formAuth"
+                loginUrl="http://localhost:44444/Account/Login.aspx"
+                loginFormId="ctl01"
+                loginRedirect="true">
+     <loginPostData>
+       <field name="ctl00$MainContent$LoginUser$UserName"
+              value="admin"/>
+       <field name="ctl00$MainContent$LoginUser$Password"
+              value="admin123"/>
+     </loginPostData>
+     <additionalPostHeaders>
+       <field name="User-Agent"
+              value="Mozilla/5.0 ... Firefox/35.0" />
+     </additionalPostHeaders>
+     <removedFormFields>
+       <field name="ctl00$MainContent$LoginUser$RememberMe"/>
+     </removedFormFields>
+   </credentials>
+ 
+ It is critical that the following fields are substituted:
+  * loginUrl - the URL containing the actual <form>
+  * loginFormId - the <form id="$formId"> attribute value
+    (or the 'name' attribute if no form is referenced by 'id' attribute)
+  * loginRedirect - if http post login returns redirect code: 301 or 302,
+    and value is true, Http Client will automatically follow the redirect.
+  * <field name="ctl00$MainContent$LoginUser$UserName" value="admin"/>
+    - the <input name="..."> attribute of the username field and the
+    user-defined username value, respectively
+  * <field name="ctl00$MainContent$LoginUser$Password" value="admin123"/>
+    - the <input name="..."> attribute of the password field and the
+    user-defined password value, respectively
+  * <field name="ctl00$MainContent$LoginUser$RememberMe"/>
+    - the name attribute of each form field we wish to skip
+ 
+ More information on HTTP POST can be located at
+ https://wiki.apache.org/nutch/HttpPostAuthentication
+
 -->
 
 <auth-configuration>

Modified: nutch/trunk/src/plugin/protocol-httpclient/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/ivy.xml?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/ivy.xml (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/ivy.xml Fri Feb 13 22:05:47 2015
@@ -36,6 +36,7 @@
   </publications>
 
   <dependencies>
+    <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
   </dependencies>
   
 </ivy-module>

Modified: nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Fri Feb 13 22:05:47 2015
@@ -1,57 +1,58 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+   
+   http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
 -->
 <plugin
    id="protocol-httpclient"
    name="Http / Https Protocol Plug-in"
    version="1.0.0"
    provider-name="nutch.org">
-
+   
    <runtime>
       <library name="protocol-httpclient.jar">
          <export name="*"/>
       </library>
+      <library name="jsoup-1.8.1.jar"/>
    </runtime>
-
+   
    <requires>
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-http"/>
    </requires>
-
+   
    <extension id="org.apache.nutch.protocol.httpclient"
-              name="HttpProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
+      name="HttpProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
       <implementation id="org.apache.nutch.protocol.httpclient.Http"
-                      class="org.apache.nutch.protocol.httpclient.Http">
-        <parameter name="protocolName" value="http"/>
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="http"/>
       </implementation>
-
+      
    </extension>
-
+   
    <extension id="org.apache.nutch.protocol.https"
-              name="HttpsProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
+      name="HttpsProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
       <implementation id="org.apache.nutch.protocol.httpclient.Http"
-                      class="org.apache.nutch.protocol.httpclient.Http">
-        <parameter name="protocolName" value="https"/>
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="https"/>
       </implementation>
-
+      
    </extension>
-
+   
 </plugin>

Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Feb 13 22:05:47 2015
@@ -21,8 +21,14 @@ import java.io.InputStream;
 import java.io.IOException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
+
 import org.xml.sax.SAXException;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -43,8 +49,10 @@ import org.apache.commons.httpclient.aut
 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 import org.apache.commons.httpclient.protocol.Protocol;
 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
-import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
+// NUTCH-1929 Consider implementing dependency injection for crawling HTTPS sites that use self-signed certificates
+//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
 
+import org.apache.commons.lang.StringUtils;
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
@@ -54,10 +62,15 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * This class is a protocol plugin that configures an HTTP client for Basic,
+ * <p>This class is a protocol plugin that configures an HTTP client for Basic,
  * Digest and NTLM authentication schemes for web server as well as proxy
  * server. It takes care of HTTPS protocol as well as cookies in a single fetch
- * session.
+ * session.</p>
+ * <p>Documentation can be found on the Nutch <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes">HttpAuthenticationSchemes</a>
+ * wiki page.</p>
+ * <p>The original description of the motivation to support <a href="https://wiki.apache.org/nutch/HttpPostAuthentication">HttpPostAuthentication</a>
+ * is also included on the Nutch wiki. Additionally HttpPostAuthentication development is documented
+ * at the <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira issue.</p>
  * 
  * @author Susam Pal
  */
@@ -85,6 +98,8 @@ public class Http extends HttpBase {
   private String proxyPassword;
   private String proxyRealm;
 
+  private static HttpFormAuthConfigurer formConfigurer;
+
   /**
    * Returns the configured HTTP client.
    * 
@@ -163,7 +178,8 @@ public class Http extends HttpBase {
   private void configureClient() {
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
-    ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+    //ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+    ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
     Protocol https = new Protocol("https", factory, 443);
     Protocol.registerProtocol("https", https);
 
@@ -194,9 +210,9 @@ public class Http extends HttpBase {
     headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
     // prefer understandable formats
     headers
-        .add(new Header(
-            "Accept",
-            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    .add(new Header(
+        "Accept",
+        "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
     // accept gzipped content
     headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
     hostConf.getParams().setParameter("http.default-headers", headers);
@@ -268,6 +284,14 @@ public class Http extends HttpBase {
           continue;
         }
 
+        String authMethod = credElement.getAttribute("authMethod");
+        // read http form post auth info
+        if (StringUtils.isNotBlank(authMethod)) {
+          formConfigurer = readFormAuthConfigurer(credElement,
+              authMethod);
+          continue;
+        }
+
         String username = credElement.getAttribute("username");
         String password = credElement.getAttribute("password");
 
@@ -337,6 +361,95 @@ public class Http extends HttpBase {
   }
 
   /**
+   * Builds an {@link HttpFormAuthConfigurer} from a {@code <credentials>} element
+   * whose {@code authMethod} is "formAuth". Required attributes: loginUrl and
+   * loginFormId; optional: loginRedirect. The child elements loginPostData and
+   * additionalPostHeaders hold {@code <field name="header1" value="value1"/>}
+   * entries, and removedFormFields holds {@code <field name="header1"/>} entries.
+   * Throws IllegalArgumentException on a missing attribute or other authMethod.
+   */
+  private static HttpFormAuthConfigurer readFormAuthConfigurer(
+      Element credElement, String authMethod) {
+    if ("formAuth".equals(authMethod)) {
+      HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
+
+      String str = credElement.getAttribute("loginUrl");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginUrl(str.trim());
+      } else {
+        throw new IllegalArgumentException("Must set loginUrl.");
+      }
+      str = credElement.getAttribute("loginFormId");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginFormId(str.trim());
+      } else {
+        throw new IllegalArgumentException("Must set loginFormId.");
+      }
+      str = credElement.getAttribute("loginRedirect");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
+      }
+
+      NodeList nodeList = credElement.getChildNodes();
+      for (int j = 0; j < nodeList.getLength(); j++) {
+        Node node = nodeList.item(j);
+        if (!(node instanceof Element))
+          continue;
+
+        Element element = (Element) node;
+        if ("loginPostData".equals(element.getTagName())) {
+          Map<String, String> loginPostData = new HashMap<String, String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            String value = fieldElement.getAttribute("value");
+            loginPostData.put(name, value);
+          }
+          formConfigurer.setLoginPostData(loginPostData);
+        } else if ("additionalPostHeaders".equals(element.getTagName())) {
+          Map<String, String> additionalPostHeaders = new HashMap<String, String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            String value = fieldElement.getAttribute("value");
+            additionalPostHeaders.put(name, value);
+          }
+          formConfigurer
+          .setAdditionalPostHeaders(additionalPostHeaders);
+        } else if ("removedFormFields".equals(element.getTagName())) {
+          Set<String> removedFormFields = new HashSet<String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            removedFormFields.add(name);
+          }
+          formConfigurer.setRemovedFormFields(removedFormFields);
+        }
+      }
+
+      return formConfigurer;
+    } else {
+      throw new IllegalArgumentException("Unsupported authMethod: "
+          + authMethod);
+    }
+  }
+
+  /**
    * If credentials for the authentication scope determined from the specified
    * <code>url</code> is not already set in the HTTP client, then this method
    * sets the default credentials to fetch the specified <code>url</code>. If
@@ -348,6 +461,18 @@ public class Http extends HttpBase {
    */
   private void resolveCredentials(URL url) {
 
+    if (formConfigurer != null) {
+      HttpFormAuthentication formAuther = new HttpFormAuthentication(
+          formConfigurer, client, this);
+      try {
+        formAuther.login();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+
+      return;
+    }
+
     if (defaultUsername != null && defaultUsername.length() > 0) {
 
       int port = url.getPort();