You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/13 23:05:48 UTC
svn commit: r1659697 - in /nutch/trunk: CHANGES.txt
conf/httpclient-auth.xml.template src/plugin/protocol-httpclient/ivy.xml
src/plugin/protocol-httpclient/plugin.xml
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Author: lewismc
Date: Fri Feb 13 22:05:47 2015
New Revision: 1659697
URL: http://svn.apache.org/r1659697
Log:
NUTCH-827 HTTP POST Authentication
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/httpclient-auth.xml.template
nutch/trunk/src/plugin/protocol-httpclient/ivy.xml
nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Feb 13 22:05:47 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)
+
* NUTCH-1724 LinkDBReader to support regex output filtering (markus)
* NUTCH-1939 Fetcher fails to follow redirects (Leo Ye via snagel)
Modified: nutch/trunk/conf/httpclient-auth.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/httpclient-auth.xml.template?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/conf/httpclient-auth.xml.template (original)
+++ nutch/trunk/conf/httpclient-auth.xml.template Fri Feb 13 22:05:47 2015
@@ -54,6 +54,54 @@
NTLM does not use the notion of realms. The domain name may be
specified as the value for 'realm' attribute in case of NTLM.
+
+ More information on Basic, Digest and NTLM authentication
+ support can be located at https://wiki.apache.org/nutch/HttpAuthenticationSchemes
+
+ HTTP-POST Authentication Support
+ HTTP form-based authentication is a very commonly used authentication
+ mechanism to protect web resources. The 'auth-configuration' element is
+ extended to include HTTP form authentication properties, as shown
+ in the following example:
+
+ Example:-
+ <credentials authMethod="formAuth"
+ loginUrl="http://localhost:44444/Account/Login.aspx"
+ loginFormId="ctl01"
+ loginRedirect="true">
+ <loginPostData>
+ <field name="ctl00$MainContent$LoginUser$UserName"
+ value="admin"/>
+ <field name="ctl00$MainContent$LoginUser$Password"
+ value="admin123"/>
+ </loginPostData>
+ <additionalPostHeaders>
+ <field name="User-Agent"
+ value="Mozilla/5.0 ... Firefox/35.0" />
+ </additionalPostHeaders>
+ <removedFormFields>
+ <field name="ctl00$MainContent$LoginUser$RememberMe"/>
+ </removedFormFields>
+ </credentials>
+
+ It is critical that the following fields are substituted:
+ * loginUrl - the URL containing the actual <form>
+ * loginFormId - the value of the <form> element's 'id' attribute
+ (or of its 'name' attribute if the form has no 'id' attribute)
+ * loginRedirect - if the HTTP POST login returns a redirect code (301 or 302)
+ and this value is true, the HTTP client will automatically follow the redirect.
+ * <field name="ctl00$MainContent$LoginUser$UserName" value="admin"/>
+ - 'name' is the name attribute of the username <input> in the login form and
+ 'value' is the user-defined username to submit for it
+ * <field name="ctl00$MainContent$LoginUser$Password" value="admin123"/>
+ - 'name' is the name attribute of the password <input> in the login form and
+ 'value' is the user-defined password to submit for it
+ * <field name="ctl00$MainContent$LoginUser$RememberMe"/>
+ - the name of a form field that we wish to skip
+
+ More information on HTTP POST can be located at
+ https://wiki.apache.org/nutch/HttpPostAuthentication
+
-->
<auth-configuration>
Modified: nutch/trunk/src/plugin/protocol-httpclient/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/ivy.xml?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/ivy.xml (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/ivy.xml Fri Feb 13 22:05:47 2015
@@ -36,6 +36,7 @@
</publications>
<dependencies>
+ <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
</dependencies>
</ivy-module>
Modified: nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Fri Feb 13 22:05:47 2015
@@ -1,57 +1,58 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
-->
<plugin
id="protocol-httpclient"
name="Http / Https Protocol Plug-in"
version="1.0.0"
provider-name="nutch.org">
-
+
<runtime>
<library name="protocol-httpclient.jar">
<export name="*"/>
</library>
+ <library name="jsoup-1.8.1.jar"/>
</runtime>
-
+
<requires>
<import plugin="nutch-extensionpoints"/>
<import plugin="lib-http"/>
</requires>
-
+
<extension id="org.apache.nutch.protocol.httpclient"
- name="HttpProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
+ name="HttpProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
<implementation id="org.apache.nutch.protocol.httpclient.Http"
- class="org.apache.nutch.protocol.httpclient.Http">
- <parameter name="protocolName" value="http"/>
+ class="org.apache.nutch.protocol.httpclient.Http">
+ <parameter name="protocolName" value="http"/>
</implementation>
-
+
</extension>
-
+
<extension id="org.apache.nutch.protocol.https"
- name="HttpsProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
+ name="HttpsProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
<implementation id="org.apache.nutch.protocol.httpclient.Http"
- class="org.apache.nutch.protocol.httpclient.Http">
- <parameter name="protocolName" value="https"/>
+ class="org.apache.nutch.protocol.httpclient.Http">
+ <parameter name="protocolName" value="https"/>
</implementation>
-
+
</extension>
-
+
</plugin>
Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1659697&r1=1659696&r2=1659697&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Feb 13 22:05:47 2015
@@ -21,8 +21,14 @@ import java.io.InputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
+
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -43,8 +49,10 @@ import org.apache.commons.httpclient.aut
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
-import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
+// NUTCH-1929 Consider implementing dependency injection for crawling HTTPS sites that use self-signed certificates
+//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
+import org.apache.commons.lang.StringUtils;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
@@ -54,10 +62,15 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.util.NutchConfiguration;
/**
- * This class is a protocol plugin that configures an HTTP client for Basic,
+ * <p>This class is a protocol plugin that configures an HTTP client for Basic,
* Digest and NTLM authentication schemes for web server as well as proxy
* server. It takes care of HTTPS protocol as well as cookies in a single fetch
- * session.
+ * session.</p>
+ * <p>Documentation can be found on the Nutch <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes">HttpAuthenticationSchemes</a>
+ * wiki page.</p>
+ * <p>The original description of the motivation to support <a href="https://wiki.apache.org/nutch/HttpPostAuthentication">HttpPostAuthentication</a>
+ * is also included on the Nutch wiki. Additionally, HttpPostAuthentication development is documented
+ * at the <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira issue.</p>
*
* @author Susam Pal
*/
@@ -85,6 +98,8 @@ public class Http extends HttpBase {
private String proxyPassword;
private String proxyRealm;
+ private static HttpFormAuthConfigurer formConfigurer;
+
/**
* Returns the configured HTTP client.
*
@@ -163,7 +178,8 @@ public class Http extends HttpBase {
private void configureClient() {
// Set up an HTTPS socket factory that accepts self-signed certs.
- ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+ //ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+ ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
Protocol https = new Protocol("https", factory, 443);
Protocol.registerProtocol("https", https);
@@ -194,9 +210,9 @@ public class Http extends HttpBase {
headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
// prefer understandable formats
headers
- .add(new Header(
- "Accept",
- "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+ .add(new Header(
+ "Accept",
+ "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
// accept gzipped content
headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
hostConf.getParams().setParameter("http.default-headers", headers);
@@ -268,6 +284,14 @@ public class Http extends HttpBase {
continue;
}
+ String authMethod = credElement.getAttribute("authMethod");
+ // read http form post auth info
+ if (StringUtils.isNotBlank(authMethod)) {
+ formConfigurer = readFormAuthConfigurer(credElement,
+ authMethod);
+ continue;
+ }
+
String username = credElement.getAttribute("username");
String password = credElement.getAttribute("password");
@@ -337,6 +361,95 @@ public class Http extends HttpBase {
}
/**
+ * <auth-configuration> <credentials authMethod="formAuth"
+ * loginUrl="loginUrl" loginFormId="loginFormId" loginRedirect="true">
+ * <loginPostData> <field name="username" value="user1"/> </loginPostData>
+ * <additionalPostHeaders> <field name="header1" value="value1"/>
+ * </additionalPostHeaders> <removedFormFields> <field name="header1"/>
+ * </removedFormFields> </credentials> </auth-configuration>
+ */
+ private static HttpFormAuthConfigurer readFormAuthConfigurer(
+ Element credElement, String authMethod) {
+ if ("formAuth".equals(authMethod)) {
+ HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
+
+ String str = credElement.getAttribute("loginUrl");
+ if (StringUtils.isNotBlank(str)) {
+ formConfigurer.setLoginUrl(str.trim());
+ } else {
+ throw new IllegalArgumentException("Must set loginUrl.");
+ }
+ str = credElement.getAttribute("loginFormId");
+ if (StringUtils.isNotBlank(str)) {
+ formConfigurer.setLoginFormId(str.trim());
+ } else {
+ throw new IllegalArgumentException("Must set loginFormId.");
+ }
+ str = credElement.getAttribute("loginRedirect");
+ if (StringUtils.isNotBlank(str)) {
+ formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
+ }
+
+ NodeList nodeList = credElement.getChildNodes();
+ for (int j = 0; j < nodeList.getLength(); j++) {
+ Node node = nodeList.item(j);
+ if (!(node instanceof Element))
+ continue;
+
+ Element element = (Element) node;
+ if ("loginPostData".equals(element.getTagName())) {
+ Map<String, String> loginPostData = new HashMap<String, String>();
+ NodeList childNodes = element.getChildNodes();
+ for (int k = 0; k < childNodes.getLength(); k++) {
+ Node fieldNode = childNodes.item(k);
+ if (!(fieldNode instanceof Element))
+ continue;
+
+ Element fieldElement = (Element) fieldNode;
+ String name = fieldElement.getAttribute("name");
+ String value = fieldElement.getAttribute("value");
+ loginPostData.put(name, value);
+ }
+ formConfigurer.setLoginPostData(loginPostData);
+ } else if ("additionalPostHeaders".equals(element.getTagName())) {
+ Map<String, String> additionalPostHeaders = new HashMap<String, String>();
+ NodeList childNodes = element.getChildNodes();
+ for (int k = 0; k < childNodes.getLength(); k++) {
+ Node fieldNode = childNodes.item(k);
+ if (!(fieldNode instanceof Element))
+ continue;
+
+ Element fieldElement = (Element) fieldNode;
+ String name = fieldElement.getAttribute("name");
+ String value = fieldElement.getAttribute("value");
+ additionalPostHeaders.put(name, value);
+ }
+ formConfigurer
+ .setAdditionalPostHeaders(additionalPostHeaders);
+ } else if ("removedFormFields".equals(element.getTagName())) {
+ Set<String> removedFormFields = new HashSet<String>();
+ NodeList childNodes = element.getChildNodes();
+ for (int k = 0; k < childNodes.getLength(); k++) {
+ Node fieldNode = childNodes.item(k);
+ if (!(fieldNode instanceof Element))
+ continue;
+
+ Element fieldElement = (Element) fieldNode;
+ String name = fieldElement.getAttribute("name");
+ removedFormFields.add(name);
+ }
+ formConfigurer.setRemovedFormFields(removedFormFields);
+ }
+ }
+
+ return formConfigurer;
+ } else {
+ throw new IllegalArgumentException("Unsupported authMethod: "
+ + authMethod);
+ }
+ }
+
+ /**
* If credentials for the authentication scope determined from the specified
* <code>url</code> is not already set in the HTTP client, then this method
* sets the default credentials to fetch the specified <code>url</code>. If
@@ -348,6 +461,18 @@ public class Http extends HttpBase {
*/
private void resolveCredentials(URL url) {
+ if (formConfigurer != null) {
+ HttpFormAuthentication formAuther = new HttpFormAuthentication(
+ formConfigurer, client, this);
+ try {
+ formAuther.login();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+
+ return;
+ }
+
if (defaultUsername != null && defaultUsername.length() > 0) {
int port = url.getPort();