You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/13 23:20:16 UTC

svn commit: r1659701 - in /nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient: HttpFormAuthConfigurer.java HttpFormAuthentication.java

Author: lewismc
Date: Fri Feb 13 22:20:15 2015
New Revision: 1659701

URL: http://svn.apache.org/r1659701
Log:
NUTCH-827 HTTP POST Authentication

Added:
    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java

Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java?rev=1659701&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java (added)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java Fri Feb 13 22:20:15 2015
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Holds the settings for a form-based (HTTP POST) login: the login page URL,
+ * the id or name of the login form, the fields to post, extra request headers,
+ * form fields to drop, and whether redirects after the POST are followed.
+ * <p>
+ * Setters return {@code this} so calls can be chained. Collection getters
+ * never return {@code null} and always expose the collection held by this
+ * object, so entries added through a getter are retained.
+ */
+public class HttpFormAuthConfigurer {
+  private String loginUrl;
+  private String loginFormId;
+  /**
+   * The data posted to login form, such as username(or email), password
+   */
+  private Map<String, String> loginPostData = new HashMap<String, String>();
+  /**
+   * In case we need add additional headers.
+   */
+  private Map<String, String> additionalPostHeaders = new HashMap<String, String>();
+  /**
+   * If http post login returns redirect code: 301 or 302,
+   * Http Client will automatically follow the redirect.
+   */
+  private boolean loginRedirect;
+  /**
+   * Used when we need remove some form fields.
+   */
+  private Set<String> removedFormFields = new HashSet<String>();
+
+  public HttpFormAuthConfigurer() {
+  }
+
+  /** @return the login page URL, or {@code null} if none has been set */
+  public String getLoginUrl() {
+    return loginUrl;
+  }
+
+  public HttpFormAuthConfigurer setLoginUrl(String loginUrl) {
+    this.loginUrl = loginUrl;
+    return this;
+  }
+
+  /** @return the configured login form identifier, or {@code null} if unset */
+  public String getLoginFormId() {
+    return loginFormId;
+  }
+
+  public HttpFormAuthConfigurer setLoginFormId(String loginForm) {
+    this.loginFormId = loginForm;
+    return this;
+  }
+
+  /** @return the live, never-{@code null} map of fields to post */
+  public Map<String, String> getLoginPostData() {
+    return loginPostData;
+  }
+
+  /**
+   * @param loginPostData
+   *          fields to post; {@code null} is replaced by an empty map
+   * @return this configurer
+   */
+  public HttpFormAuthConfigurer setLoginPostData(
+      Map<String, String> loginPostData) {
+    this.loginPostData = loginPostData == null ? new HashMap<String, String>()
+        : loginPostData;
+    return this;
+  }
+
+  /** @return the live, never-{@code null} map of extra request headers */
+  public Map<String, String> getAdditionalPostHeaders() {
+    return additionalPostHeaders;
+  }
+
+  /**
+   * @param additionalPostHeaders
+   *          extra headers; {@code null} is replaced by an empty map
+   * @return this configurer
+   */
+  public HttpFormAuthConfigurer setAdditionalPostHeaders(
+      Map<String, String> additionalPostHeaders) {
+    this.additionalPostHeaders = additionalPostHeaders == null
+        ? new HashMap<String, String>() : additionalPostHeaders;
+    return this;
+  }
+
+  public boolean isLoginRedirect() {
+    return loginRedirect;
+  }
+
+  public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) {
+    this.loginRedirect = redirect;
+    return this;
+  }
+
+  /** @return the live, never-{@code null} set of form field names to drop */
+  public Set<String> getRemovedFormFields() {
+    return removedFormFields;
+  }
+
+  /**
+   * @param removedFormFields
+   *          field names to drop; {@code null} is replaced by an empty set
+   * @return this configurer
+   */
+  public HttpFormAuthConfigurer setRemovedFormFields(
+      Set<String> removedFormFields) {
+    this.removedFormFields = removedFormFields == null ? new HashSet<String>()
+        : removedFormFields;
+    return this;
+  }
+}

Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java?rev=1659701&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java (added)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java Fri Feb 13 22:20:15 2015
@@ -0,0 +1,349 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.io.IOUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs a form-based (HTTP POST) login with Commons HttpClient: fetches the
+ * login page, collects the fields of the configured login form, merges in the
+ * configured post data and posts the result back to the login URL.
+ */
+public class HttpFormAuthentication {
+  private static final Logger LOGGER = LoggerFactory
+      .getLogger(HttpFormAuthentication.class);
+  /**
+   * Baseline headers for the login POST. Filled once by the static initializer
+   * and only read afterwards; every instance works on its own copy.
+   */
+  private static final Map<String, String> defaultLoginHeaders = new HashMap<String, String>();
+
+  static {
+    defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
+    defaultLoginHeaders
+    .put("Accept",
+        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+    defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
+    defaultLoginHeaders.put("Connection", "keep-alive");
+    defaultLoginHeaders.put("Content-Type",
+        "application/x-www-form-urlencoded");
+  }
+
+  private HttpClient client;
+  private HttpFormAuthConfigurer authConfigurer = new HttpFormAuthConfigurer();
+  private String cookies;
+  /**
+   * Login headers of this instance, seeded from the shared defaults. Kept apart
+   * from the static map so one instance's overrides cannot leak into others.
+   */
+  private final Map<String, String> loginHeaders = new HashMap<String, String>(
+      defaultLoginHeaders);
+
+  /**
+   * Creates an authenticator that uses the given client and takes its Accept,
+   * Accept-Language and User-Agent header values from {@code http}.
+   *
+   * @param authConfigurer
+   *          login settings
+   * @param client
+   *          client used for the login requests
+   * @param http
+   *          source of the header values; a {@code null} value leaves the
+   *          corresponding default in place
+   */
+  public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer,
+      HttpClient client, Http http) {
+    this.authConfigurer = authConfigurer;
+    this.client = client;
+    overrideLoginHeader("Accept", http.getAccept());
+    overrideLoginHeader("Accept-Language", http.getAcceptLanguage());
+    overrideLoginHeader("User-Agent", http.getUserAgent());
+  }
+
+  /**
+   * Creates an authenticator with its own {@link HttpClient} and the default
+   * login headers.
+   *
+   * @param loginUrl
+   *          URL the login page is fetched from and the form is posted to
+   * @param loginForm
+   *          id (or, failing that, name) of the login form
+   * @param loginPostData
+   *          fields to post; may be {@code null}
+   * @param additionalPostHeaders
+   *          extra request headers; may be {@code null}
+   * @param removedFormFields
+   *          form fields to leave out of the POST; may be {@code null}
+   */
+  public HttpFormAuthentication(String loginUrl, String loginForm,
+      Map<String, String> loginPostData,
+      Map<String, String> additionalPostHeaders,
+      Set<String> removedFormFields) {
+    this.authConfigurer.setLoginUrl(loginUrl);
+    this.authConfigurer.setLoginFormId(loginForm);
+    this.authConfigurer.setLoginPostData(
+        loginPostData == null ? new HashMap<String, String>() : loginPostData);
+    this.authConfigurer.setAdditionalPostHeaders(
+        additionalPostHeaders == null ? new HashMap<String, String>()
+            : additionalPostHeaders);
+    this.authConfigurer.setRemovedFormFields(
+        removedFormFields == null ? new HashSet<String>() : removedFormFields);
+    this.client = new HttpClient();
+  }
+
+  /**
+   * Fetches the login page, builds the form parameters from it and posts them
+   * to the login URL.
+   *
+   * @throws Exception
+   *           if either request fails
+   * @throws IllegalArgumentException
+   *           if no form id is configured or the page contains no such form
+   */
+  public void login() throws Exception {
+    // Make sure a JVM-wide cookie handler exists, but keep one that another
+    // component has already installed instead of replacing it on every login.
+    if (CookieHandler.getDefault() == null) {
+      CookieHandler.setDefault(new CookieManager());
+    }
+    String loginUrl = authConfigurer.getLoginUrl();
+    String pageContent = httpGetPageContent(loginUrl);
+    List<NameValuePair> params = getLoginFormParams(pageContent);
+    sendPost(loginUrl, params);
+  }
+
+  /**
+   * Posts the login parameters to {@code url} and reads the response body.
+   */
+  private void sendPost(String url, List<NameValuePair> params)
+      throws Exception {
+    PostMethod post = null;
+    try {
+      if (authConfigurer.isLoginRedirect()) {
+        // we can't use post.setFollowRedirects(true) as it will throw
+        // IllegalArgumentException:
+        // Entity enclosing requests cannot be redirected without user
+        // intervention
+        post = new PostMethod(url) {
+          @Override
+          public boolean getFollowRedirects() {
+            return true;
+          }
+        };
+      } else {
+        post = new PostMethod(url);
+      }
+      setLoginHeader(post);
+      post.addParameters(params.toArray(new NameValuePair[0]));
+      int rspCode = client.executeMethod(post);
+      String rst = readBody(post.getResponseBodyAsStream(),
+          post.getResponseCharSet());
+      if (LOGGER.isDebugEnabled()) {
+        // Log field names only: the values include the login credentials.
+        List<String> paramNames = new ArrayList<String>();
+        for (NameValuePair param : params) {
+          paramNames.add(param.getName());
+        }
+        LOGGER.debug("Sending 'POST' request to URL : {}", url);
+        LOGGER.debug("Post parameter names : {}", paramNames);
+        LOGGER.debug("Response Code : {}", rspCode);
+        for (Header header : post.getResponseHeaders()) {
+          LOGGER.debug("Response headers : {}", header);
+        }
+        LOGGER.debug("login post result: {}", rst);
+      }
+    } finally {
+      if (post != null) {
+        post.releaseConnection();
+      }
+    }
+  }
+
+  /**
+   * Applies the login headers to the POST: this instance's defaults first, then
+   * the configured additional headers, then the stored cookie if there is one.
+   */
+  private void setLoginHeader(PostMethod post) {
+    for (Entry<String, String> entry : loginHeaders.entrySet()) {
+      post.setRequestHeader(entry.getKey(), entry.getValue());
+    }
+    // additionalPostHeaders can overwrite value in the defaults; setting
+    // (rather than adding) replaces the default instead of duplicating it
+    for (Entry<String, String> entry : authConfigurer
+        .getAdditionalPostHeaders().entrySet()) {
+      post.setRequestHeader(entry.getKey(), entry.getValue());
+    }
+    String cookieValue = getCookies();
+    if (cookieValue != null) {
+      post.addRequestHeader("Cookie", cookieValue);
+    }
+  }
+
+  /**
+   * Fetches {@code url} with a GET, remembers the Set-Cookie response header
+   * value if present, and returns the response body.
+   *
+   * @return the page content, or an empty string if there is no body
+   */
+  private String httpGetPageContent(String url) throws IOException {
+    GetMethod get = new GetMethod(url);
+    try {
+      for (Entry<String, String> entry : authConfigurer
+          .getAdditionalPostHeaders().entrySet()) {
+        get.addRequestHeader(entry.getKey(), entry.getValue());
+      }
+      client.executeMethod(get);
+      Header cookieHeader = get.getResponseHeader("Set-Cookie");
+      if (cookieHeader != null) {
+        setCookies(cookieHeader.getValue());
+      }
+      return readBody(get.getResponseBodyAsStream(), get.getResponseCharSet());
+    } finally {
+      get.releaseConnection();
+    }
+  }
+
+  /**
+   * Reads a response body into a string using the given charset.
+   *
+   * @return the decoded body, or an empty string when {@code body} is
+   *         {@code null}
+   */
+  private static String readBody(java.io.InputStream body, String charset)
+      throws IOException {
+    if (body == null) {
+      return "";
+    }
+    return IOUtils.toString(body, charset);
+  }
+
+  /**
+   * Stores a header value in this instance's login headers unless it is
+   * {@code null}.
+   */
+  private void overrideLoginHeader(String name, String value) {
+    if (value != null) {
+      loginHeaders.put(name, value);
+    }
+  }
+
+  /**
+   * Builds the POST parameters: every named input of the login form that is
+   * neither overridden by the configured post data nor listed as removed,
+   * followed by the configured post data itself.
+   *
+   * @throws IllegalArgumentException
+   *           if no form id is configured or no form with that id or name
+   *           exists in the page
+   * @throws UnsupportedEncodingException
+   *           kept from the original signature; nothing in this method
+   *           raises it
+   */
+  private List<NameValuePair> getLoginFormParams(String pageContent)
+      throws UnsupportedEncodingException {
+    String formId = authConfigurer.getLoginFormId();
+    if (formId == null || formId.trim().length() == 0) {
+      throw new IllegalArgumentException("No login form id configured");
+    }
+    Document doc = Jsoup.parse(pageContent);
+    Element loginform = doc.getElementById(formId);
+    if (loginform == null) {
+      LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
+          formId);
+      loginform = findFormByName(doc, formId);
+      if (loginform == null) {
+        LOGGER.debug("No form element found with 'name' = {}", formId);
+        throw new IllegalArgumentException("No form exists: " + formId);
+      }
+    }
+    Map<String, String> postData = authConfigurer.getLoginPostData();
+    Set<String> removedFields = authConfigurer.getRemovedFormFields();
+    List<NameValuePair> params = new ArrayList<NameValuePair>();
+    Elements inputElements = loginform.getElementsByTag("input");
+    for (Element inputElement : inputElements) {
+      String key = inputElement.attr("name");
+      // inputs without a name are not part of a form submission
+      if (key.length() == 0 || postData.containsKey(key)
+          || removedFields.contains(key)) {
+        continue;
+      }
+      params.add(new NameValuePair(key, inputElement.attr("value")));
+    }
+    // add key and value in loginPostData
+    for (Entry<String, String> entry : postData.entrySet()) {
+      params.add(new NameValuePair(entry.getKey(), entry.getValue()));
+    }
+    return params;
+  }
+
+  /**
+   * Finds the first form whose name attribute equals {@code formName},
+   * ignoring case and surrounding whitespace. Comparing attribute values
+   * directly avoids building a CSS selector from configuration text.
+   *
+   * @return the form element, or {@code null} if none matches
+   */
+  private static Element findFormByName(Document doc, String formName) {
+    String wanted = formName.trim();
+    for (Element form : doc.getElementsByTag("form")) {
+      if (wanted.equalsIgnoreCase(form.attr("name").trim())) {
+        return form;
+      }
+    }
+    return null;
+  }
+
+  public String getCookies() {
+    return cookies;
+  }
+
+  public void setCookies(String cookies) {
+    this.cookies = cookies;
+  }
+
+  public boolean isRedirect() {
+    return authConfigurer.isLoginRedirect();
+  }
+
+  public void setRedirect(boolean redirect) {
+    this.authConfigurer.setLoginRedirect(redirect);
+  }
+
+}