You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/13 23:20:16 UTC
svn commit: r1659701 - in
/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient:
HttpFormAuthConfigurer.java HttpFormAuthentication.java
Author: lewismc
Date: Fri Feb 13 22:20:15 2015
New Revision: 1659701
URL: http://svn.apache.org/r1659701
Log:
NUTCH-827 HTTP POST Authentication
Added:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java?rev=1659701&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java (added)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java Fri Feb 13 22:20:15 2015
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Fluent, mutable holder for the settings of an HTTP form (POST) login: the
+ * login URL, the form id, the data to post, additional request headers, the
+ * redirect flag and the form fields to drop.
+ * <p>
+ * The collection getters never return {@code null} and hand out the
+ * collection actually held by this object, so entries added through a getter
+ * are retained. The class performs no synchronization.
+ */
+public class HttpFormAuthConfigurer {
+  private String loginUrl;
+  private String loginFormId;
+  /**
+   * The data posted to the login form, such as username (or email), password.
+   */
+  private Map<String, String> loginPostData = new HashMap<String, String>();
+  /**
+   * In case we need to add additional headers.
+   */
+  private Map<String, String> additionalPostHeaders = new HashMap<String, String>();
+  /**
+   * If the HTTP POST login returns redirect code 301 or 302, the HTTP client
+   * will automatically follow the redirect.
+   */
+  private boolean loginRedirect;
+  /**
+   * Used when we need to remove some form fields.
+   */
+  private Set<String> removedFormFields = new HashSet<String>();
+
+  public HttpFormAuthConfigurer() {
+  }
+
+  /**
+   * @return the login URL, or {@code null} if none was set
+   */
+  public String getLoginUrl() {
+    return loginUrl;
+  }
+
+  /**
+   * @param loginUrl the login URL to store
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setLoginUrl(String loginUrl) {
+    this.loginUrl = loginUrl;
+    return this;
+  }
+
+  /**
+   * @return the login form id, or {@code null} if none was set
+   */
+  public String getLoginFormId() {
+    return loginFormId;
+  }
+
+  /**
+   * @param loginForm the login form id to store
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setLoginFormId(String loginForm) {
+    this.loginFormId = loginForm;
+    return this;
+  }
+
+  /**
+   * @return the never-null post data map held by this configurer; entries
+   *         added to it are kept
+   */
+  public Map<String, String> getLoginPostData() {
+    return loginPostData;
+  }
+
+  /**
+   * @param loginPostData the data to post; {@code null} resets to an empty map
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setLoginPostData(
+      Map<String, String> loginPostData) {
+    this.loginPostData = loginPostData != null ? loginPostData
+        : new HashMap<String, String>();
+    return this;
+  }
+
+  /**
+   * @return the never-null additional header map held by this configurer;
+   *         entries added to it are kept
+   */
+  public Map<String, String> getAdditionalPostHeaders() {
+    return additionalPostHeaders;
+  }
+
+  /**
+   * @param additionalPostHeaders the extra headers; {@code null} resets to an
+   *          empty map
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setAdditionalPostHeaders(
+      Map<String, String> additionalPostHeaders) {
+    this.additionalPostHeaders = additionalPostHeaders != null
+        ? additionalPostHeaders : new HashMap<String, String>();
+    return this;
+  }
+
+  /**
+   * @return the stored redirect flag
+   */
+  public boolean isLoginRedirect() {
+    return loginRedirect;
+  }
+
+  /**
+   * @param redirect the redirect flag to store
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) {
+    this.loginRedirect = redirect;
+    return this;
+  }
+
+  /**
+   * @return the never-null set of removed form field names held by this
+   *         configurer; entries added to it are kept
+   */
+  public Set<String> getRemovedFormFields() {
+    return removedFormFields;
+  }
+
+  /**
+   * @param removedFormFields the field names to drop; {@code null} resets to
+   *          an empty set
+   * @return this configurer, for call chaining
+   */
+  public HttpFormAuthConfigurer setRemovedFormFields(
+      Set<String> removedFormFields) {
+    this.removedFormFields = removedFormFields != null ? removedFormFields
+        : new HashSet<String>();
+    return this;
+  }
+}
Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java?rev=1659701&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java (added)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java Fri Feb 13 22:20:15 2015
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.io.IOUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs an HTML form based (HTTP POST) login with commons-httpclient: the
+ * login page is fetched with GET, the input fields of the configured form are
+ * read with jsoup, merged with the configured post data and posted back to
+ * the login URL.
+ */
+public class HttpFormAuthentication {
+  private static final Logger LOGGER = LoggerFactory
+      .getLogger(HttpFormAuthentication.class);
+  /**
+   * Baseline login headers. Only written in the static initializer below;
+   * instances work on their own copy, see {@link #loginHeaders}.
+   */
+  private static final Map<String, String> defaultLoginHeaders = new HashMap<String, String>();
+
+  static {
+    defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
+    defaultLoginHeaders
+        .put("Accept",
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+    defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
+    defaultLoginHeaders.put("Connection", "keep-alive");
+    defaultLoginHeaders.put("Content-Type",
+        "application/x-www-form-urlencoded");
+  }
+
+  private HttpClient client;
+  private HttpFormAuthConfigurer authConfigurer = new HttpFormAuthConfigurer();
+  private String cookies;
+  /**
+   * Per-instance copy of the default headers, so that values taken from one
+   * {@code Http} object do not leak into other instances through the shared
+   * static map.
+   */
+  private final Map<String, String> loginHeaders = new HashMap<String, String>(
+      defaultLoginHeaders);
+
+  /**
+   * Creates an authenticator that uses the given configuration and client.
+   * The Accept, Accept-Language and User-Agent login headers are taken from
+   * {@code http}; a {@code null} value keeps the built-in default.
+   *
+   * @param authConfigurer the login settings
+   * @param client the client used for the GET and POST requests
+   * @param http source of the Accept, Accept-Language and User-Agent values
+   */
+  public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer,
+      HttpClient client, Http http) {
+    this.authConfigurer = authConfigurer;
+    this.client = client;
+    overrideLoginHeader("Accept", http.getAccept());
+    overrideLoginHeader("Accept-Language", http.getAcceptLanguage());
+    overrideLoginHeader("User-Agent", http.getUserAgent());
+  }
+
+  /**
+   * Creates an authenticator with its own {@link HttpClient} and the built-in
+   * default login headers.
+   *
+   * @param loginUrl URL of the login page, also used as the POST target
+   * @param loginForm id (or name) of the login form
+   * @param loginPostData values to post; {@code null} means none
+   * @param additionalPostHeaders extra headers; {@code null} means none
+   * @param removedFormFields form fields to drop; {@code null} means none
+   */
+  public HttpFormAuthentication(String loginUrl, String loginForm,
+      Map<String, String> loginPostData,
+      Map<String, String> additionalPostHeaders,
+      Set<String> removedFormFields) {
+    this.authConfigurer.setLoginUrl(loginUrl);
+    this.authConfigurer.setLoginFormId(loginForm);
+    this.authConfigurer
+        .setLoginPostData(loginPostData == null ? new HashMap<String, String>()
+            : loginPostData);
+    this.authConfigurer
+        .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>()
+            : additionalPostHeaders);
+    this.authConfigurer
+        .setRemovedFormFields(removedFormFields == null ? new HashSet<String>()
+            : removedFormFields);
+    this.client = new HttpClient();
+  }
+
+  /**
+   * Fetches the login page, builds the form parameters and posts them to the
+   * login URL.
+   *
+   * @throws IllegalArgumentException if the configured form is not found in
+   *           the login page
+   * @throws Exception if one of the HTTP requests fails
+   */
+  public void login() throws Exception {
+    // make sure cookies are turned on, without replacing a handler (and the
+    // cookies it holds) that is already installed JVM-wide
+    if (CookieHandler.getDefault() == null) {
+      CookieHandler.setDefault(new CookieManager());
+    }
+    String pageContent = httpGetPageContent(authConfigurer.getLoginUrl());
+    List<NameValuePair> params = getLoginFormParams(pageContent);
+    sendPost(authConfigurer.getLoginUrl(), params);
+  }
+
+  /** Stores a login header value for this instance, ignoring {@code null}. */
+  private void overrideLoginHeader(String name, String value) {
+    if (value != null) {
+      loginHeaders.put(name, value);
+    }
+  }
+
+  /** Posts the login parameters to {@code url} and logs the outcome. */
+  private void sendPost(String url, List<NameValuePair> params)
+      throws Exception {
+    PostMethod post = null;
+    try {
+      if (authConfigurer.isLoginRedirect()) {
+        // we can't use post.setFollowRedirects(true) as it will throw
+        // IllegalArgumentException:
+        // Entity enclosing requests cannot be redirected without user
+        // intervention
+        post = new PostMethod(url) {
+          @Override
+          public boolean getFollowRedirects() {
+            return true;
+          }
+        };
+      } else {
+        post = new PostMethod(url);
+      }
+      setLoginHeader(post);
+      post.addParameters(params.toArray(new NameValuePair[0]));
+      int rspCode = client.executeMethod(post);
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug("Sending 'POST' request to URL : {}", url);
+        // only the names are logged: the values include the credentials
+        LOGGER.debug("Post parameter names : {}", parameterNames(params));
+        LOGGER.debug("Response Code : {}", rspCode);
+        for (Header header : post.getResponseHeaders()) {
+          LOGGER.debug("Response headers : {}", header);
+        }
+      }
+      // the body is read even when debug logging is off, as before
+      String rst = readBody(post.getResponseBodyAsStream());
+      LOGGER.debug("login post result: {}", rst);
+    } finally {
+      if (post != null) {
+        post.releaseConnection();
+      }
+    }
+  }
+
+  /** Collects the parameter names, leaving the values out. */
+  private static List<String> parameterNames(List<NameValuePair> params) {
+    List<String> names = new ArrayList<String>(params.size());
+    for (NameValuePair param : params) {
+      names.add(param.getName());
+    }
+    return names;
+  }
+
+  /** Reads a response body; a missing body stream yields an empty string. */
+  private static String readBody(java.io.InputStream body) throws IOException {
+    return body == null ? "" : IOUtils.toString(body);
+  }
+
+  /**
+   * Adds this instance's login headers, the configured additional headers and,
+   * if one was captured, the cookie to the POST request.
+   */
+  private void setLoginHeader(PostMethod post) {
+    Map<String, String> headers = new HashMap<String, String>(loginHeaders);
+    // additionalPostHeaders can overwrite values of the default headers
+    headers.putAll(authConfigurer.getAdditionalPostHeaders());
+    for (Entry<String, String> entry : headers.entrySet()) {
+      post.addRequestHeader(entry.getKey(), entry.getValue());
+    }
+    String cookie = getCookies();
+    if (cookie != null) {
+      post.addRequestHeader("Cookie", cookie);
+    }
+  }
+
+  /**
+   * Fetches {@code url} with GET, remembers the value of the Set-Cookie
+   * response header (if present) and returns the response body.
+   */
+  private String httpGetPageContent(String url) throws IOException {
+    GetMethod get = new GetMethod(url);
+    try {
+      for (Entry<String, String> entry : authConfigurer
+          .getAdditionalPostHeaders().entrySet()) {
+        get.addRequestHeader(entry.getKey(), entry.getValue());
+      }
+      client.executeMethod(get);
+      Header cookieHeader = get.getResponseHeader("Set-Cookie");
+      if (cookieHeader != null) {
+        setCookies(cookieHeader.getValue());
+      }
+      return readBody(get.getResponseBodyAsStream());
+    } finally {
+      get.releaseConnection();
+    }
+  }
+
+  /**
+   * Builds the parameters to post: the named input fields of the login form
+   * that are neither removed nor overridden, followed by the configured post
+   * data. The form is looked up by id first, then by name.
+   *
+   * @throws IllegalArgumentException if no matching form exists
+   */
+  private List<NameValuePair> getLoginFormParams(String pageContent) {
+    String formId = authConfigurer.getLoginFormId();
+    Map<String, String> postData = authConfigurer.getLoginPostData();
+    Set<String> removedFields = authConfigurer.getRemovedFormFields();
+
+    Document doc = Jsoup.parse(pageContent);
+    Element loginform = doc.getElementById(formId);
+    if (loginform == null) {
+      LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
+          formId);
+      loginform = doc.select("form[name="+ formId + "]").first();
+      if (loginform == null) {
+        LOGGER.debug("No form element found with 'name' = {}", formId);
+        throw new IllegalArgumentException("No form exists: " + formId);
+      }
+    }
+
+    List<NameValuePair> params = new ArrayList<NameValuePair>();
+    for (Element inputElement : loginform.getElementsByTag("input")) {
+      String key = inputElement.attr("name");
+      // inputs without a name are not part of a form submission; fields in
+      // loginPostData are added below, removed fields are dropped
+      if (key.isEmpty() || postData.containsKey(key)
+          || removedFields.contains(key)) {
+        continue;
+      }
+      params.add(new NameValuePair(key, inputElement.attr("value")));
+    }
+    // add key and value in loginPostData
+    for (Entry<String, String> entry : postData.entrySet()) {
+      params.add(new NameValuePair(entry.getKey(), entry.getValue()));
+    }
+    return params;
+  }
+
+  /**
+   * @return the cookie value sent with the login POST, or {@code null} if
+   *         none was captured or set
+   */
+  public String getCookies() {
+    return cookies;
+  }
+
+  /**
+   * @param cookies the cookie value to send with the login POST
+   */
+  public void setCookies(String cookies) {
+    this.cookies = cookies;
+  }
+
+  /**
+   * @return the redirect flag of the underlying configuration
+   */
+  public boolean isRedirect() {
+    return authConfigurer.isLoginRedirect();
+  }
+
+  /**
+   * @param redirect the redirect flag to store in the underlying configuration
+   */
+  public void setRedirect(boolean redirect) {
+    this.authConfigurer.setLoginRedirect(redirect);
+  }
+
+}