You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:24 UTC
[40/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
new file mode 100644
index 0000000..9f616fe
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -0,0 +1,587 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+// Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
+import org.apache.hadoop.util.StringUtils;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
+public abstract class HttpBase implements Protocol {
+
+ public static final Text RESPONSE_TIME = new Text("_rs_");
+
+ public static final int BUFFER_SIZE = 8 * 1024;
+
+ private static final byte[] EMPTY_CONTENT = new byte[0];
+
+ private HttpRobotRulesParser robots = null;
+
+ private ArrayList<String> userAgentNames = null;
+
+ /** The proxy hostname. */
+ protected String proxyHost = null;
+
+ /** The proxy port. */
+ protected int proxyPort = 8080;
+
+ /** The proxy exception list. */
+ protected HashMap proxyException = new HashMap();
+
+ /** Indicates if a proxy is used */
+ protected boolean useProxy = false;
+
+ /** The network timeout in millisecond */
+ protected int timeout = 10000;
+
+ /** The length limit for downloaded content, in bytes. */
+ protected int maxContent = 64 * 1024;
+
+ /** The Nutch 'User-Agent' request header */
+ protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+ "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
+
+ /** The "Accept-Language" request header value. */
+ protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
+
+ /** The "Accept" request header value. */
+ protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+ /** The default logger */
+ private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
+
+ /** The specified logger */
+ private Logger logger = LOGGER;
+
+ /** The nutch configuration */
+ private Configuration conf = null;
+
+ /** Do we use HTTP/1.1? */
+ protected boolean useHttp11 = false;
+
+ /**
+ * Record response time in CrawlDatum's meta data, see property
+ * http.store.responsetime.
+ */
+ protected boolean responseTime = true;
+
+ /** Skip page if Crawl-Delay longer than this value. */
+ protected long maxCrawlDelay = -1L;
+
+ /** Which TLS/SSL protocols to support */
+ protected Set<String> tlsPreferredProtocols;
+
+ /** Which TLS/SSL cipher suites to support */
+ protected Set<String> tlsPreferredCipherSuites;
+
+ /** Configuration directive for If-Modified-Since HTTP header */
+ public boolean enableIfModifiedsinceHeader = true;
+
+ /**
+  * Creates a new instance of HttpBase which logs through the class-level
+  * default logger.
+  */
+ public HttpBase() {
+   this(null);
+ }
+
+ /**
+  * Creates a new instance of HttpBase.
+  *
+  * @param logger
+  *          the logger this instance should write to; when {@code null} the
+  *          class-level default logger stays in place
+  */
+ public HttpBase(Logger logger) {
+   this.robots = new HttpRobotRulesParser();
+   if (logger == null) {
+     return; // keep the default assigned by the field initializer
+   }
+   this.logger = logger;
+ }
+
+ /**
+  * Configures this protocol implementation from the given configuration.
+  * <p>
+  * Reads the proxy, timeout, content limit, user agent, request header, HTTP
+  * version, response time and If-Modified-Since settings, passes the
+  * configuration on to the robots.txt parser, optionally loads the list of
+  * rotating user agent names (NUTCH-1941) and finally reads the supported
+  * TLS/SSL protocols and cipher suites. The effective settings are logged via
+  * {@link #logConf()}.
+  *
+  * @param conf
+  *          the configuration to read the {@code http.*} properties from
+  */
+ public void setConf(Configuration conf) {
+   this.conf = conf;
+   this.proxyHost = conf.get("http.proxy.host");
+   this.proxyPort = conf.getInt("http.proxy.port", 8080);
+   this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
+   this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+   this.timeout = conf.getInt("http.timeout", 10000);
+   this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+   this.userAgent = getAgentString(conf.get("http.agent.name"),
+       conf.get("http.agent.version"), conf.get("http.agent.description"),
+       conf.get("http.agent.url"), conf.get("http.agent.email"));
+   this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
+   this.accept = conf.get("http.accept", accept);
+   // backward-compatible default setting
+   this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+   this.responseTime = conf.getBoolean("http.store.responsetime", true);
+   this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+   this.robots.setConf(conf);
+
+   // NUTCH-1941: read list of alternating agent names.
+   // Reset first so that a list loaded by an earlier setConf() call does not
+   // survive when rotation has been switched off in the new configuration.
+   userAgentNames = null;
+   if (conf.getBoolean("http.agent.rotate", false)) {
+     String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+     userAgentNames = readUserAgentNames(conf, agentsFile);
+     if (userAgentNames == null) {
+       logger
+           .warn("Falling back to fixed user agent set via property http.agent.name");
+     }
+   }
+
+   String[] protocols = conf.getStrings("http.tls.supported.protocols",
+       "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+   String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+       "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+       "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+       "TLS_RSA_WITH_AES_256_CBC_SHA256",
+       "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+       "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+       "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+       "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+       "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+       "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+       "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+       "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+       "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+       "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+       "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+       "TLS_RSA_WITH_AES_128_CBC_SHA256",
+       "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+       "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+       "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+       "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+       "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+       "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+       "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+       "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+       "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+       "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+       "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+       "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+       "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+       "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+       "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+       "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+       "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+       "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+       "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+       "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+       "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+       "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+       "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+       "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+       "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+       "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+       "TLS_KRB5_WITH_DES_CBC_MD5");
+
+   tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+   tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+
+   logConf();
+ }
+
+ /**
+  * Reads the user agent names used for rotation, one name per line, from the
+  * given configuration resource. Lines are trimmed and blank lines skipped.
+  *
+  * @param conf
+  *          configuration used to locate the resource
+  * @param agentsFile
+  *          name of the resource holding the agent names
+  * @return the non-empty list of agent names, or {@code null} if the resource
+  *         is missing, unreadable or contains no names (a warning is logged in
+  *         each of these cases)
+  */
+ private ArrayList<String> readUserAgentNames(Configuration conf,
+     String agentsFile) {
+   Reader reader = conf.getConfResourceAsReader(agentsFile);
+   if (reader == null) {
+     // the resource could not be opened; report it explicitly instead of
+     // running into a NullPointerException inside BufferedReader
+     logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+         "resource not found or not readable");
+     return null;
+   }
+
+   ArrayList<String> names = new ArrayList<String>();
+   try (BufferedReader br = new BufferedReader(reader)) {
+     String line;
+     while ((line = br.readLine()) != null) {
+       String name = line.trim();
+       if (!name.isEmpty()) {
+         names.add(name);
+       }
+     }
+   } catch (IOException e) {
+     logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+         StringUtils.stringifyException(e));
+     return null;
+   }
+
+   if (names.isEmpty()) {
+     logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+         agentsFile);
+     return null;
+   }
+   return names;
+ }
+
+ /**
+ * Returns the configuration stored by {@link #setConf(Configuration)}.
+ *
+ * @return the stored configuration, or {@code null} if {@code setConf} has
+ * not been called yet
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+  * Fetches the given URL and translates the HTTP response into a
+  * {@link ProtocolOutput}.
+  * <p>
+  * The HTTP status code (and, if enabled, the response time) is recorded in
+  * the metadata of {@code datum}. Status 200 yields a plain output, 3xx codes
+  * are mapped to MOVED / TEMP_MOVED / NOTMODIFIED together with the resolved
+  * redirect target, 400 and 410 are mapped to GONE, 401 to ACCESS_DENIED, 404
+  * to NOTFOUND and everything else to EXCEPTION. Redirects are not followed
+  * here; they are left to the caller.
+  *
+  * @param url
+  *          the URL to fetch
+  * @param datum
+  *          the crawl datum whose metadata receives status code and response
+  *          time
+  * @return the fetched content with its protocol status; if anything is
+  *         thrown while fetching, an output without content whose status
+  *         wraps the throwable
+  */
+ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+
+   final String urlString = url.toString();
+   try {
+     URL u = new URL(urlString);
+
+     final long requestStart = System.currentTimeMillis();
+     final Response response = getResponse(u, datum, false); // make a request
+
+     if (this.responseTime) {
+       int elapsedTime = (int) (System.currentTimeMillis() - requestStart);
+       datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
+     }
+
+     final int code = response.getCode();
+     datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+         new Text(Integer.toString(code)));
+
+     byte[] body = response.getContent();
+     if (body == null) {
+       body = EMPTY_CONTENT;
+     }
+     final Content c = new Content(u.toString(), u.toString(), body,
+         response.getHeader("Content-Type"), response.getHeaders(), this.conf);
+
+     // got a good response: return it
+     if (code == 200) {
+       return new ProtocolOutput(c);
+     }
+
+     // handle redirect
+     if (code >= 300 && code < 400) {
+       String location = response.getHeader("Location");
+       if (location == null) {
+         // some broken servers, such as MS IIS, use lowercase header name...
+         location = response.getHeader("location");
+       }
+       if (location == null) {
+         location = "";
+       }
+       u = new URL(u, location);
+
+       final int protocolStatusCode;
+       switch (code) {
+       case 302: // found (temporarily moved)
+       case 303: // see other (redirect after POST)
+       case 307: // temporary redirect
+         protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+         break;
+       case 304: // not modified
+         protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+         break;
+       default:
+         // 300 (multiple choices), 301 (moved permanently), 305 (use proxy)
+         // and every other 3xx code
+         protocolStatusCode = ProtocolStatus.MOVED;
+         break;
+       }
+       // handle this in the higher layer.
+       return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
+     }
+
+     final ProtocolStatus status;
+     switch (code) {
+     case 400: // bad request, mark as GONE
+       if (logger.isTraceEnabled()) {
+         logger.trace("400 Bad request: " + u);
+       }
+       status = new ProtocolStatus(ProtocolStatus.GONE, u);
+       break;
+     case 401: // requires authorization, but no valid auth provided.
+       if (logger.isTraceEnabled()) {
+         logger.trace("401 Authentication Required");
+       }
+       status = new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
+           "Authentication required: " + urlString);
+       break;
+     case 404:
+       status = new ProtocolStatus(ProtocolStatus.NOTFOUND, u);
+       break;
+     case 410: // permanently GONE
+       status = new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code
+           + " url=" + u);
+       break;
+     default:
+       status = new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code="
+           + code + ", url=" + u);
+       break;
+     }
+     return new ProtocolOutput(c, status);
+   } catch (Throwable e) {
+     logger.error("Failed to get protocol output", e);
+     return new ProtocolOutput(null, new ProtocolStatus(e));
+   }
+ }
+
+ /*
+ * -------------------------- * </implementation:Protocol> *
+ * --------------------------
+ */
+
+ /**
+ * Returns the proxy hostname.
+ *
+ * @return the value of the {@code proxyHost} field; it is {@code null}
+ * until set and is assigned from {@code http.proxy.host} in
+ * {@code setConf}
+ */
+ public String getProxyHost() {
+ return proxyHost;
+ }
+
+ /**
+ * Returns the proxy port.
+ *
+ * @return the value of the {@code proxyPort} field, which defaults to 8080
+ */
+ public int getProxyPort() {
+ return proxyPort;
+ }
+
+ /**
+  * Decides whether a request for the given URL has to go through the proxy.
+  *
+  * @param url
+  *          the URL about to be requested
+  * @return {@code true} if a proxy is in use and the host of {@code url} is
+  *         not contained in the proxy exception list, {@code false} otherwise
+  */
+ public boolean useProxy(URL url) {
+   // short-circuit: the exception list is only consulted when a proxy is set
+   return useProxy && proxyException.get(url.getHost()) == null;
+ }
+
+ /**
+ * Returns the network timeout.
+ *
+ * @return the timeout in milliseconds
+ */
+ public int getTimeout() {
+ return timeout;
+ }
+
+ /**
+ * Tells whether the configuration directive for the If-Modified-Since HTTP
+ * header is switched on.
+ *
+ * @return the value of the {@code enableIfModifiedsinceHeader} field
+ */
+ public boolean isIfModifiedSinceEnabled() {
+ return enableIfModifiedsinceHeader;
+ }
+
+ /**
+ * Returns the length limit for downloaded content.
+ *
+ * @return the limit in bytes; the decompression helpers of this class apply
+ * no limit when the value is negative
+ */
+ public int getMaxContent() {
+ return maxContent;
+ }
+
+ public String getUserAgent() {
+ if (userAgentNames!=null) {
+ return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1));
+ }
+ return userAgent;
+ }
+
+ /**
+ * Value of "Accept-Language" request header sent by Nutch.
+ *
+ * @return The value of the "Accept-Language" header.
+ */
+ public String getAcceptLanguage() {
+ return acceptLanguage;
+ }
+
+ /**
+ * Value of the "Accept" request header.
+ *
+ * @return The value of the "Accept" header.
+ */
+ public String getAccept() {
+ return accept;
+ }
+
+ /**
+ * Tells whether HTTP/1.1 is used.
+ *
+ * @return the value of the {@code useHttp11} field, which is {@code false}
+ * by default
+ */
+ public boolean getUseHttp11() {
+ return useHttp11;
+ }
+
+ /**
+ * Returns the TLS/SSL cipher suites to support.
+ *
+ * @return the set assigned in {@code setConf}; the internal set itself is
+ * returned, not a copy
+ */
+ public Set<String> getTlsPreferredCipherSuites() {
+ return tlsPreferredCipherSuites;
+ }
+
+ /**
+ * Returns the TLS/SSL protocols to support.
+ *
+ * @return the set assigned in {@code setConf}; the internal set itself is
+ * returned, not a copy
+ */
+ public Set<String> getTlsPreferredProtocols() {
+ return tlsPreferredProtocols;
+ }
+
+ /**
+  * Builds the "User-Agent" string in the form
+  * {@code name[/version][ (description; url; email)]}.
+  * <p>
+  * The parenthesised part lists only those of description, URL and email
+  * which are neither {@code null} nor empty, separated by "; ". It is left
+  * out entirely when none of them is set. A missing agent name is reported
+  * as an error but does not abort: the name is appended as given (a
+  * {@code null} name therefore shows up as the text "null").
+  *
+  * @param agentName
+  *          the agent name (property http.agent.name)
+  * @param agentVersion
+  *          the agent version, may be {@code null}
+  * @param agentDesc
+  *          the agent description, may be {@code null} or empty
+  * @param agentURL
+  *          the agent URL, may be {@code null} or empty
+  * @param agentEmail
+  *          the agent email address, may be {@code null} or empty
+  * @return the assembled user agent string
+  */
+ private static String getAgentString(String agentName, String agentVersion,
+     String agentDesc, String agentURL, String agentEmail) {
+
+   if ((agentName == null) || (agentName.trim().length() == 0)) {
+     // TODO : NUTCH-258
+     LOGGER.error("No User-Agent string set (http.agent.name)!");
+   }
+
+   StringBuilder buf = new StringBuilder();
+   buf.append(agentName);
+   if (agentVersion != null) {
+     buf.append("/").append(agentVersion);
+   }
+
+   // Collect the optional details first. The separator is only written
+   // between two parts that are actually emitted, so an empty (but non-null)
+   // URL or email no longer leaves a dangling "; " before the closing
+   // parenthesis.
+   StringBuilder details = new StringBuilder();
+   for (String part : new String[] { agentDesc, agentURL, agentEmail }) {
+     if (part == null || part.length() == 0) {
+       continue;
+     }
+     if (details.length() > 0) {
+       details.append("; ");
+     }
+     details.append(part);
+   }
+
+   if (details.length() > 0) {
+     buf.append(" (").append(details).append(")");
+   }
+   return buf.toString();
+ }
+
+ /**
+  * Logs the effective HTTP settings at INFO level. Does nothing if INFO
+  * logging is disabled for the logger of this instance.
+  */
+ protected void logConf() {
+   if (!logger.isInfoEnabled()) {
+     return;
+   }
+   logger.info("http.proxy.host = " + proxyHost);
+   logger.info("http.proxy.port = " + proxyPort);
+   // log the hosts excluded from proxying; this line used to print the
+   // useProxy flag under the exception list's property name
+   logger.info("http.proxy.exception.list = "
+       + (proxyException == null ? null : proxyException.keySet()));
+   logger.info("http.timeout = " + timeout);
+   logger.info("http.content.limit = " + maxContent);
+   logger.info("http.agent = " + userAgent);
+   logger.info("http.accept.language = " + acceptLanguage);
+   logger.info("http.accept = " + accept);
+ }
+
+ /**
+  * Uncompresses gzip-encoded response content on a best-effort basis.
+  *
+  * @param compressed
+  *          the gzip-compressed bytes as received
+  * @param url
+  *          the URL the content was fetched from, used for trace logging only
+  * @return the uncompressed bytes, limited to {@link #getMaxContent()} bytes
+  *         if that limit is not negative; the input array itself if it is
+  *         empty
+  * @throws IOException
+  *           if the unzip utility returns {@code null}
+  */
+ public byte[] processGzipEncoded(byte[] compressed, URL url)
+     throws IOException {
+
+   final boolean tracing = LOGGER.isTraceEnabled();
+   if (tracing) {
+     LOGGER.trace("uncompressing....");
+   }
+
+   // content can be empty (i.e. redirection) in which case
+   // there is nothing to unzip
+   if (compressed.length == 0) {
+     return compressed;
+   }
+
+   final byte[] unzipped = (getMaxContent() >= 0)
+       ? GZIPUtils.unzipBestEffort(compressed, getMaxContent())
+       : GZIPUtils.unzipBestEffort(compressed);
+
+   if (unzipped == null) {
+     throw new IOException("unzipBestEffort returned null");
+   }
+
+   if (tracing) {
+     LOGGER.trace("fetched " + compressed.length
+         + " bytes of compressed content (expanded to " + unzipped.length
+         + " bytes) from " + url);
+   }
+   return unzipped;
+ }
+
+ /**
+  * Inflates deflate-encoded response content on a best-effort basis.
+  *
+  * @param compressed
+  *          the deflate-compressed bytes as received
+  * @param url
+  *          the URL the content was fetched from, used for trace logging only
+  * @return the inflated bytes, limited to {@link #getMaxContent()} bytes if
+  *         that limit is not negative; the input array itself if it is empty
+  * @throws IOException
+  *           if the inflate utility returns {@code null}
+  */
+ public byte[] processDeflateEncoded(byte[] compressed, URL url)
+     throws IOException {
+
+   // content can be empty (i.e. redirection) in which case
+   // there is nothing to deflate
+   if (compressed.length == 0) {
+     return compressed;
+   }
+
+   final boolean tracing = LOGGER.isTraceEnabled();
+   if (tracing) {
+     LOGGER.trace("inflating....");
+   }
+
+   final byte[] inflated = (getMaxContent() >= 0)
+       ? DeflateUtils.inflateBestEffort(compressed, getMaxContent())
+       : DeflateUtils.inflateBestEffort(compressed);
+
+   if (inflated == null) {
+     throw new IOException("inflateBestEffort returned null");
+   }
+
+   if (tracing) {
+     LOGGER.trace("fetched " + compressed.length
+         + " bytes of compressed content (expanded to " + inflated.length
+         + " bytes) from " + url);
+   }
+   return inflated;
+ }
+
+ /**
+  * Command line helper for concrete protocol implementations: fetches a
+  * single URL with the given instance and prints status and content to
+  * standard output.
+  * <p>
+  * Usage: {@code Http [-verbose] [-timeout N] url}. The URL has to be the
+  * last argument. On any usage error the usage line is printed to standard
+  * error and the JVM exits with status -1.
+  *
+  * @param http
+  *          the protocol instance used for the request
+  * @param args
+  *          the command line arguments
+  * @throws Exception
+  *           if the timeout value is not a number or the fetch fails
+  *           unexpectedly
+  */
+ protected static void main(HttpBase http, String[] args) throws Exception {
+   String url = null;
+
+   String usage = "Usage: Http [-verbose] [-timeout N] url";
+
+   if (args.length == 0) {
+     System.err.println(usage);
+     System.exit(-1);
+     return;
+   }
+
+   for (int i = 0; i < args.length; i++) { // parse command line
+     if (args[i].equals("-timeout")) { // found -timeout option
+       if (i + 1 >= args.length) {
+         // option given without its value
+         System.err.println(usage);
+         System.exit(-1);
+         return;
+       }
+       http.timeout = Integer.parseInt(args[++i]) * 1000; // seconds -> msec
+     } else if (args[i].equals("-verbose")) { // found -verbose option
+       // accepted for compatibility; it currently has no effect because the
+       // log level is no longer adjusted here
+     } else if (i != args.length - 1) {
+       System.err.println(usage);
+       System.exit(-1);
+       return;
+     } else {
+       // url is required parameter
+       url = args[i];
+     }
+   }
+
+   if (url == null) {
+     // only options were given, e.g. "-verbose" or "-timeout 5"
+     System.err.println(usage);
+     System.exit(-1);
+     return;
+   }
+
+   ProtocolOutput out = http
+       .getProtocolOutput(new Text(url), new CrawlDatum());
+   Content content = out.getContent();
+
+   System.out.println("Status: " + out.getStatus());
+   if (content != null) {
+     System.out.println("Content Type: " + content.getContentType());
+     System.out.println("Content Length: "
+         + content.getMetadata().get(Response.CONTENT_LENGTH));
+     System.out.println("Content:");
+     String text = new String(content.getContent());
+     System.out.println(text);
+   }
+ }
+
+ /**
+ * Performs the request for the given URL. Subclasses provide the actual
+ * implementation.
+ *
+ * @param url
+ * the URL to request
+ * @param datum
+ * the crawl datum belonging to the URL
+ * @param followRedirects
+ * NOTE(review): presumably tells the implementation whether it
+ * should follow redirects itself - confirm against the implementing
+ * classes
+ * @return the response to the request
+ * @throws ProtocolException
+ * declared for implementations to signal protocol-level failures
+ * @throws IOException
+ * declared for implementations to signal I/O failures
+ */
+ protected abstract Response getResponse(URL url, CrawlDatum datum,
+ boolean followRedirects) throws ProtocolException, IOException;
+
+ /**
+ * Returns the robots.txt rules for the given URL by delegating to the
+ * {@link HttpRobotRulesParser} of this instance.
+ *
+ * @param url
+ * the URL the rules are requested for
+ * @param datum
+ * not used by this implementation
+ * @return the robot rules obtained from the parser
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return robots.getRobotRulesSet(this, url);
+ }
+
+ /**
+  * Transforms a String[] into a HashMap for faster searching. Every entry is
+  * trimmed and stored as both key and value; {@code null} and blank entries
+  * are skipped.
+  * <p>
+  * Trimming matters because the entries come from a comma-separated property
+  * value: with a list written as "a.example.com, b.example.com" the second
+  * entry carries a leading blank and would otherwise never match a host
+  * name.
+  *
+  * @param input
+  *          String[], may be {@code null} or empty
+  * @return a new HashMap, never {@code null}
+  */
+ private HashMap<String, String> arrayToMap(String[] input) {
+   HashMap<String, String> hm = new HashMap<String, String>();
+   if (input == null) {
+     return hm;
+   }
+   for (String entry : input) {
+     if (entry == null) {
+       continue;
+     }
+     String key = entry.trim();
+     if (!key.isEmpty()) {
+       hm.put(key, key);
+     }
+   }
+   return hm;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
new file mode 100644
index 0000000..ff7ef5b
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// Nutch imports
+import org.apache.nutch.protocol.ProtocolException;
+
+/**
+ * A {@link ProtocolException} declared in the HTTP API package. It adds no
+ * state of its own; every constructor simply delegates to the matching
+ * constructor of the superclass.
+ */
+public class HttpException extends ProtocolException {
+
+ /** Creates an exception without message and cause. */
+ public HttpException() {
+ super();
+ }
+
+ /**
+ * Creates an exception with the given message.
+ *
+ * @param message
+ * the detail message
+ */
+ public HttpException(String message) {
+ super(message);
+ }
+
+ /**
+ * Creates an exception with the given message and cause.
+ *
+ * @param message
+ * the detail message
+ * @param cause
+ * the underlying cause
+ */
+ public HttpException(String message, Throwable cause) {
+ super(message, cause);
+ }
+
+ /**
+ * Creates an exception wrapping the given cause.
+ *
+ * @param cause
+ * the underlying cause
+ */
+ public HttpException(Throwable cause) {
+ super(cause);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
new file mode 100644
index 0000000..185ca15
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HttpRobotRulesParser.class);
+ protected boolean allowForbidden = false;
+
+ /**
+ * Creates a parser without configuration. {@link #setConf(Configuration)}
+ * is not invoked by this constructor.
+ */
+ HttpRobotRulesParser() {
+ }
+
+ /**
+ * Creates a parser and configures it right away.
+ *
+ * @param conf
+ * the configuration passed on to {@link #setConf(Configuration)}
+ */
+ public HttpRobotRulesParser(Configuration conf) {
+ setConf(conf);
+ }
+
+ /**
+ * Passes the configuration to the superclass and reads the property
+ * {@code http.robots.403.allow} (default {@code true}) into
+ * {@code allowForbidden}. Note that the field itself is initialized to
+ * {@code false}, so the effective value differs until this method has been
+ * called.
+ *
+ * @param conf
+ * the configuration to read from
+ */
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+ }
+
+ /**
+  * Compose unique key to store and access robot rules in cache for given URL.
+  * <p>
+  * Robot rules apply only to the host, protocol, and port where robots.txt is
+  * hosted (cf. NUTCH-1752). Consequently the key has the form
+  * {@code protocol:host:port}, with protocol and host in lower case and the
+  * protocol's default port filled in when the URL does not specify one.
+  *
+  * @param url
+  *          the URL to derive the key from
+  * @return the cache key, e.g. {@code http:example.com:80}
+  */
+ protected static String getCacheKey(URL url) {
+   // Locale.ROOT keeps the normalization independent of the JVM's default
+   // locale (in a Turkish locale "I" would otherwise become a dotless "ı"
+   // and the same host could end up under different keys).
+   String protocol = url.getProtocol().toLowerCase(java.util.Locale.ROOT);
+   String host = url.getHost().toLowerCase(java.util.Locale.ROOT);
+   int port = url.getPort();
+   if (port == -1) {
+     port = url.getDefaultPort();
+   }
+   return protocol + ":" + host + ":" + port;
+ }
+
+ /**
+ * Get the rules from robots.txt which apply to the given {@code url}.
+ * Robot rules are cached for a unique combination of host, protocol, and
+ * port. If no rules are found in the cache, an HTTP request is sent to fetch
+ * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
+ * the rules are cached to avoid re-fetching and re-parsing it.
+ * <p>
+ * For whitelisted hosts no request is made and empty rules are used. A
+ * single 301/302 redirect of robots.txt is followed. A 403 response yields
+ * forbid-all rules unless {@code allowForbidden} is set; any other response
+ * except 200 yields empty rules. Empty rules resulting from a response code
+ * of 500 or above, or from an exception, are not cached so that the fetch
+ * is tried again later.
+ *
+ * @param http
+ * The {@link Protocol} object; it is cast to {@link HttpBase} to
+ * issue the request
+ * @param url
+ * URL robots.txt applies to
+ *
+ * @return {@link BaseRobotRules} holding the rules from robots.txt
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+ // trace output only; the whitelist is evaluated again below on a cache miss
+ if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ }
+
+ String cacheKey = getCacheKey(url);
+ BaseRobotRules robotRules = CACHE.get(cacheKey);
+
+ if (robotRules != null) {
+ return robotRules; // cached rule
+ } else if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
+
+ // set to false when the result should not be remembered
+ boolean cacheRule = true;
+ // target of a followed redirect, if any
+ URL redir = null;
+
+ if (isWhiteListed(url)) {
+ // check in advance whether a host is whitelisted
+ // (we do not need to fetch robots.txt)
+ robotRules = EMPTY_RULES;
+ LOG.info("Whitelisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+ url.getHost());
+
+ } else {
+ try {
+ Response response = ((HttpBase) http).getResponse(new URL(url,
+ "/robots.txt"), new CrawlDatum(), true);
+ // try one level of redirection ?
+ if (response.getCode() == 301 || response.getCode() == 302) {
+ String redirection = response.getHeader("Location");
+ if (redirection == null) {
+ // some versions of MS IIS are known to mangle this header
+ redirection = response.getHeader("location");
+ }
+ if (redirection != null) {
+ if (!redirection.startsWith("http")) {
+ // RFC says it should be absolute, but apparently it isn't
+ redir = new URL(url, redirection);
+ } else {
+ redir = new URL(redirection);
+ }
+
+ response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
+ true);
+ }
+ }
+
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"), agentNames);
+
+ else if ((response.getCode() == 403) && (!allowForbidden))
+ robotRules = FORBID_ALL_RULES; // use forbid all
+ else if (response.getCode() >= 500) {
+ cacheRule = false; // try again later to fetch robots.txt
+ robotRules = EMPTY_RULES;
+ } else
+ robotRules = EMPTY_RULES; // use default rules
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ }
+ cacheRule = false; // try again later to fetch robots.txt
+ robotRules = EMPTY_RULES;
+ }
+ }
+
+ if (cacheRule) {
+ CACHE.put(cacheKey, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+ // cache also for the redirected host
+ CACHE.put(getCacheKey(redir), robotRules);
+ }
+ }
+
+ return robotRules;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
new file mode 100644
index 0000000..972bb3c
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http},
+{@link org.apache.nutch.protocol.httpclient httpclient})</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/nutch-plugins/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
new file mode 100644
index 0000000..23e4ef6
--- /dev/null
+++ b/nutch-plugins/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ *
+ */
+public class TestRobotRulesParser {
+
+ private static final String CONTENT_TYPE = "text/plain";
+ private static final String SINGLE_AGENT = "Agent1";
+ private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+ private static final String UNKNOWN_AGENT = "AgentABC";
+ private static final String CR = "\r";
+
+ private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+ + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+ + CR
+ + "Crawl-delay: 10"
+ + CR // set crawl delay for Agent1 as 10 sec
+ + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+ + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+ + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+ // crawl
+ // delay
+ // for
+ // other
+ // agents
+
+ private static final String[] TEST_PATHS = new String[] {
+ "http://example.com/a", "http://example.com/a/bloh/foo.html",
+ "http://example.com/b", "http://example.com/c",
+ "http://example.com/b/a/index.html",
+ "http://example.com/foo/bar/baz.html" };
+
+ private static final boolean[] RESULTS = new boolean[] { false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ true, // /c
+ false, // /b/a/index.html
+ true // /foo/bar/baz.html
+ };
+
+ private HttpRobotRulesParser parser;
+ private BaseRobotRules rules;
+
+ /**
+ * Creates the test case together with the parser under test. The parser is
+ * created through its no-argument constructor, i.e. without configuration.
+ */
+ public TestRobotRulesParser() {
+ parser = new HttpRobotRulesParser();
+ }
+
+ /**
+ * Test that the robots rules are interpreted correctly by the robots rules
+ * parser.
+ */
+ @Test
+ public void testRobotsAgent() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT);
+
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ }
+
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, MULTIPLE_AGENTS);
+
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ }
+ }
+
+ /**
+  * Test that the crawl delay is extracted from the robots file for the
+  * respective agent. If it is not specified for a given agent, the default
+  * value must be returned.
+  */
+ @Test
+ public void testCrawlDelay() {
+   // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+   // returned by the parser
+   rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+       CONTENT_TYPE, SINGLE_AGENT);
+   final long delayForKnownAgent = rules.getCrawlDelay();
+   Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+       delayForKnownAgent == 10000);
+
+   // for UNKNOWN_AGENT, the default crawl delay must be returned.
+   rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+       CONTENT_TYPE, UNKNOWN_AGENT);
+   final long delayForUnknownAgent = rules.getCrawlDelay();
+   Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+       delayForUnknownAgent == Long.MIN_VALUE);
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-nekohtml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/build.xml b/nutch-plugins/lib-nekohtml/build.xml
new file mode 100644
index 0000000..4bca1af
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-nekohtml" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+ <!--
+ ! Override the compile and jar targets,
+ ! since there is nothing to compile here.
+ ! -->
+ <target name="compile" depends="init, resolve-default"/>
+
+ <target name="jar" depends="compile"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-nekohtml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/ivy.xml b/nutch-plugins/lib-nekohtml/ivy.xml
new file mode 100644
index 0000000..ed70b80
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-nekohtml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/plugin.xml b/nutch-plugins/lib-nekohtml/plugin.xml
new file mode 100644
index 0000000..513c9a7
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! NekoHTML is a simple HTML scanner and tag balancer that enables
+ ! application programmers to parse HTML documents and access the
+ ! information using standard XML interfaces.
+ ! (http://sourceforge.net/projects/nekohtml/)
+ !
+ ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
+ !-->
+<plugin
+ id="lib-nekohtml"
+ name="CyberNeko HTML Parser"
+ version="1.9.19"
+ provider-name="net.sourceforge.nekohtml">
+
+ <runtime>
+ <library name="nekohtml-1.9.19.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-nekohtml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/pom.xml b/nutch-plugins/lib-nekohtml/pom.xml
new file mode 100644
index 0000000..e51d61d
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>lib-nekohtml</artifactId>
+ <packaging>jar</packaging>
+
+ <name>lib-nekohtml</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/build.xml b/nutch-plugins/lib-regex-filter/build.xml
new file mode 100644
index 0000000..9702ca2
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-regex-filter" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/ivy.xml b/nutch-plugins/lib-regex-filter/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/plugin.xml b/nutch-plugins/lib-regex-filter/plugin.xml
new file mode 100644
index 0000000..42de8f1
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/plugin.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for RegExp based URL filters
+ !-->
+<plugin
+ id="lib-regex-filter"
+ name="Regex URL Filter Framework"
+ version="1.0"
+ provider-name="org.apache.nutch">
+
+ <runtime>
+ <library name="lib-regex-filter.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/pom.xml b/nutch-plugins/lib-regex-filter/pom.xml
new file mode 100644
index 0000000..59f4b10
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>lib-regex-filter</artifactId>
+ <packaging>jar</packaging>
+
+ <name>lib-regex-filter</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
new file mode 100644
index 0000000..e408586
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+/**
+ * A generic regular expression rule.
+ *
+ * <p>
+ * A rule couples a regular expression with a sign (accept or reject) and an
+ * optional host or domain. Subclasses supply the actual matching in
+ * {@link #match(String)}.
+ * </p>
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexRule {
+
+ /** <code>true</code> if matching urls are accepted, <code>false</code> if rejected. */
+ private final boolean sign;
+
+ /** Host or domain this rule belongs to, or <code>null</code> if none was given. */
+ private final String hostOrDomain;
+
+ /** The regular expression of this rule, as passed to the constructor. */
+ private final String regex;
+
+ /**
+ * Constructs a new regular expression rule that is not bound to any host or
+ * domain (the host or domain is set to <code>null</code>).
+ *
+ * @param sign
+ * specifies if this rule must filter-in or filter-out. A
+ * <code>true</code> value means that any url matching this rule must
+ * be accepted, a <code>false</code> value means that any url
+ * matching this rule must be rejected.
+ * @param regex
+ * is the regular expression used for matching (see
+ * {@link #match(String)} method).
+ */
+ protected RegexRule(boolean sign, String regex) {
+ this(sign, regex, null);
+ }
+
+ /**
+ * Constructs a new regular expression rule.
+ *
+ * @param sign
+ * specifies if this rule must filter-in or filter-out. A
+ * <code>true</code> value means that any url matching this rule must
+ * be accepted, a <code>false</code> value means that any url
+ * matching this rule must be rejected.
+ * @param regex
+ * is the regular expression used for matching (see
+ * {@link #match(String)} method).
+ * @param hostOrDomain
+ * the host or domain to which this regex belongs
+ */
+ protected RegexRule(boolean sign, String regex, String hostOrDomain) {
+ this.sign = sign;
+ this.hostOrDomain = hostOrDomain;
+ this.regex = regex;
+ }
+
+ /**
+ * Return if this rule is used for filtering-in or out.
+ *
+ * @return <code>true</code> if any url matching this rule must be accepted,
+ * otherwise <code>false</code>.
+ */
+ protected boolean accept() {
+ return sign;
+ }
+
+ /**
+ * Returns the host or domain this rule belongs to.
+ *
+ * @return host or domain this regex rule belongs to, or <code>null</code>
+ * if the rule was constructed without one
+ */
+ protected String hostOrDomain() { return hostOrDomain; }
+
+ /**
+ * Returns this rule's regular expression.
+ *
+ * @return the regular expression string given at construction time
+ */
+ protected String regex() { return regex; }
+
+ /**
+ * Checks if a url matches this rule.
+ *
+ * @param url
+ * is the url to check.
+ * @return <code>true</code> if the specified url matches this rule, otherwise
+ * <code>false</code>.
+ */
+ protected abstract boolean match(String url);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
new file mode 100644
index 0000000..f5cc081
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.File;
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.ArrayList;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ *
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * determined for each implementation using the
+ * {@link #getRulesReader(Configuration conf)} method.
+ * </p>
+ *
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
+ * <code>
+ * [+-]<regex>
+ * </code><br/>
+ * where plus (<code>+</code>)means go ahead and index it and minus (
+ * <code>-</code>)means no.
+ * </p>
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+ /** My logger */
+ private final static Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBase.class);
+
+ /** An array of applicable rules */
+ private List<RegexRule> rules;
+
+ /** The current configuration */
+ private Configuration conf;
+
+ /**
+ * Constructs a new empty RegexURLFilterBase
+ */
+ public RegexURLFilterBase() {
+ }
+
+ /**
+ * Constructs a new RegexURLFilter and init it with a file of rules.
+ *
+ * @param filename
+ * is the name of rules file.
+ */
+ public RegexURLFilterBase(File filename) throws IOException,
+ IllegalArgumentException {
+ this(new FileReader(filename));
+ }
+
+ /**
+ * Constructs a new RegexURLFilter and inits it with a list of rules.
+ *
+ * @param rules
+ * string with a list of rules, one rule per line
+ * @throws IOException
+ * @throws IllegalArgumentException
+ */
+ public RegexURLFilterBase(String rules) throws IOException,
+ IllegalArgumentException {
+ this(new StringReader(rules));
+ }
+
+ /**
+ * Constructs a new RegexURLFilter and init it with a Reader of rules.
+ *
+ * @param reader
+ * is a reader of rules.
+ */
+ protected RegexURLFilterBase(Reader reader) throws IOException,
+ IllegalArgumentException {
+ rules = readRules(reader);
+ }
+
+ /**
+ * Creates a new {@link RegexRule}.
+ *
+ * @param sign
+ * of the regular expression. A <code>true</code> value means that
+ * any URL matching this rule must be included, whereas a
+ * <code>false</code> value means that any URL matching this rule
+ * must be excluded.
+ * @param regex
+ * is the regular expression associated to this rule.
+ */
+ protected abstract RegexRule createRule(boolean sign, String regex);
+
+ /**
+ * Creates a new {@link RegexRule}.
+ * @param
+ * sign of the regular expression.
+ * A <code>true</code> value means that any URL matching this rule
+ * must be included, whereas a <code>false</code>
+ * value means that any URL matching this rule must be excluded.
+ * @param regex
+ * is the regular expression associated to this rule.
+ * @param hostOrDomain
+ * the host or domain to which this regex belongs
+ */
+ protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
+
+ /**
+ * Returns the name of the file of rules to use for a particular
+ * implementation.
+ *
+ * @param conf
+ * is the current configuration.
+ * @return the name of the resource containing the rules to use.
+ */
+ protected abstract Reader getRulesReader(Configuration conf)
+ throws IOException;
+
+ /*
+ * -------------------------- * <implementation:URLFilter> *
+ * --------------------------
+ */
+
+ // Inherited Javadoc
+ public String filter(String url) {
+ String host = URLUtil.getHost(url);
+ String domain = null;
+
+ try {
+ domain = URLUtil.getDomainName(url);
+ } catch (MalformedURLException e) {
+ // shouldnt happen here right?
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("URL belongs to host " + host + " and domain " + domain);
+ }
+
+ for (RegexRule rule : rules) {
+ // Skip the skip for rules that don't share the same host and domain
+ if (rule.hostOrDomain() != null &&
+ !rule.hostOrDomain().equals(host) &&
+ !rule.hostOrDomain().equals(domain)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
+ }
+
+ continue;
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
+ }
+
+ if (rule.match(url)) {
+ return rule.accept() ? url : null;
+ }
+ }
+ ;
+ return null;
+ }
+
+ /*
+ * --------------------------- * </implementation:URLFilter> *
+ * ---------------------------
+ */
+
+ /*
+ * ----------------------------- * <implementation:Configurable> *
+ * -----------------------------
+ */
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ Reader reader = null;
+ try {
+ reader = getRulesReader(conf);
+ } catch (Exception e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ try {
+ rules = readRules(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /*
+ * ------------------------------ * </implementation:Configurable> *
+ * ------------------------------
+ */
+
+ /**
+ * Read the specified file of rules.
+ *
+ * @param reader
+ * is a reader of regular expressions rules.
+ * @return the corresponding {@RegexRule rules}.
+ */
+ private List<RegexRule> readRules(Reader reader) throws IOException,
+ IllegalArgumentException {
+
+ BufferedReader in = new BufferedReader(reader);
+ List<RegexRule> rules = new ArrayList<RegexRule>();
+ String line;
+ String hostOrDomain = null;
+
+ while ((line = in.readLine()) != null) {
+ if (line.length() == 0) {
+ continue;
+ }
+ char first = line.charAt(0);
+ boolean sign = false;
+ switch (first) {
+ case '+':
+ sign = true;
+ break;
+ case '-':
+ sign = false;
+ break;
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
+ continue;
+ case '>':
+ hostOrDomain = line.substring(1).trim();
+ continue;
+ case '<':
+ hostOrDomain = null;
+ continue;
+ default:
+ throw new IOException("Invalid first character: " + line);
+ }
+
+ String regex = line.substring(1);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
+ }
+ RegexRule rule = createRule(sign, regex, hostOrDomain);
+ rules.add(rule);
+ }
+ return rules;
+ }
+
+ /**
+ * Filter the standard input using a RegexURLFilterBase.
+ *
+ * @param filter
+ * is the RegexURLFilterBase to use for filtering the standard input.
+ * @param args
+ * some optional parameters (not used).
+ */
+ public static void main(RegexURLFilterBase filter, String args[])
+ throws IOException, IllegalArgumentException {
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ String line;
+ while ((line = in.readLine()) != null) {
+ String out = filter.filter(line);
+ if (out != null) {
+ System.out.print("+");
+ System.out.println(out);
+ } else {
+ System.out.print("-");
+ System.out.println(line);
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
new file mode 100644
index 0000000..b849353
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} library,
+ * abstracting away from regular expression implementations.
+ */
+package org.apache.nutch.urlfilter.api;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
new file mode 100644
index 0000000..0b58231
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.net.URLFilter;
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class RegexURLFilterBaseTest {
+
+ /** My logger */
+ protected static final Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBaseTest.class);
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ protected abstract URLFilter getURLFilter(Reader rules);
+
+ protected void bench(int loops, String file) {
+ try {
+ bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ } catch (Exception e) {
+ Assert.fail(e.toString());
+ }
+ }
+
+ protected void bench(int loops, Reader rules, Reader urls) {
+ long start = System.currentTimeMillis();
+ try {
+ URLFilter filter = getURLFilter(rules);
+ FilteredURL[] expected = readURLFile(urls);
+ for (int i = 0; i < loops; i++) {
+ test(filter, expected);
+ }
+ } catch (Exception e) {
+ Assert.fail(e.toString());
+ }
+ LOG.info("bench time (" + loops + ") "
+ + (System.currentTimeMillis() - start) + "ms");
+ }
+
+ protected void test(String file) {
+ try {
+ test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ } catch (Exception e) {
+ Assert.fail(e.toString());
+ }
+ }
+
+ protected void test(Reader rules, Reader urls) {
+ try {
+ test(getURLFilter(rules), readURLFile(urls));
+ } catch (Exception e) {
+ Assert.fail(e.toString());
+ }
+ }
+
+ protected void test(URLFilter filter, FilteredURL[] expected) {
+ for (int i = 0; i < expected.length; i++) {
+ String result = filter.filter(expected[i].url);
+ if (result != null) {
+ Assert.assertTrue(expected[i].url, expected[i].sign);
+ } else {
+ Assert.assertFalse(expected[i].url, expected[i].sign);
+ }
+ }
+ }
+
+ private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+ BufferedReader in = new BufferedReader(reader);
+ List<FilteredURL> list = new ArrayList<FilteredURL>();
+ String line;
+ while ((line = in.readLine()) != null) {
+ if (line.length() != 0) {
+ list.add(new FilteredURL(line));
+ }
+ }
+ return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
+ }
+
+ private static class FilteredURL {
+
+ boolean sign;
+ String url;
+
+ FilteredURL(String line) {
+ switch (line.charAt(0)) {
+ case '+':
+ sign = true;
+ break;
+ case '-':
+ sign = false;
+ break;
+ default:
+ // Simply ignore...
+ }
+ url = line.substring(1);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build-ivy.xml b/nutch-plugins/lib-selenium/build-ivy.xml
new file mode 100644
index 0000000..3abcf6d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+ <property name="ivy.install.version" value="2.1.0" />
+ <condition property="ivy.home" value="${env.IVY_HOME}">
+ <isset property="env.IVY_HOME" />
+ </condition>
+ <property name="ivy.home" value="${user.home}/.ant" />
+ <property name="ivy.checksums" value="" />
+ <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+ <target name="download-ivy" unless="offline">
+
+ <mkdir dir="${ivy.jar.dir}"/>
+ <!-- download Ivy from web site so that it can be used even without any special installation -->
+ <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+ dest="${ivy.jar.file}" usetimestamp="true"/>
+ </target>
+
+ <target name="init-ivy" depends="download-ivy">
+ <!-- try to load ivy here from ivy home, in case the user has not already dropped
+ it into ant's lib dir (note that the latter copy will always take precedence).
+ We will not fail as long as local lib dir exists (it may be empty) and
+ ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml"
+ uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+ </target>
+
+ <target name="deps-jar" depends="init-ivy">
+ <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build.xml b/nutch-plugins/lib-selenium/build.xml
new file mode 100644
index 0000000..7c6d98d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ </fileset>
+ </path>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
new file mode 100644
index 0000000..1892a62
--- /dev/null
+++ b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
@@ -0,0 +1,15 @@
+1. Upgrade the versions of the driver dependencies in src/plugin/lib-selenium/ivy.xml
+
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
+
+ To get a list of dependencies and their versions execute:
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
+
+ Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
+
+ N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows:
+
+ $ brew install gnu-sed --with-default-names
+
+ You can then restart your terminal and the Regex + Sed command should work just fine!
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/ivy.xml b/nutch-plugins/lib-selenium/ivy.xml
new file mode 100644
index 0000000..701b725
--- /dev/null
+++ b/nutch-plugins/lib-selenium/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <!-- begin selenium dependencies -->
+ <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
+
+ <dependency org="com.opera" name="operadriver" rev="1.5">
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ </dependency>
+ <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+ </dependency>
+ <!-- end selenium dependencies -->
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/plugin.xml b/nutch-plugins/lib-selenium/plugin.xml
new file mode 100644
index 0000000..a86d665
--- /dev/null
+++ b/nutch-plugins/lib-selenium/plugin.xml
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A library plugin that exports Selenium and its dependent libraries
+ !-->
+<plugin
+ id="lib-selenium"
+ name="HTTP Framework"
+ version="1.0"
+ provider-name="org.apache.nutch">
+
+ <runtime>
+ <library name="lib-selenium.jar">
+ <export name="*"/>
+ </library>
+ <!-- all classes from dependent libraries are exported -->
+ <library name="cglib-nodep-2.1_3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-codec-1.10.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-collections-3.2.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-exec-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-io-2.4.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-jxpath-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-lang3-3.4.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-logging-1.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="cssparser-0.9.16.jar">
+ <export name="*"/>
+ </library>
+ <library name="gson-2.3.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="guava-18.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="htmlunit-2.18.jar">
+ <export name="*"/>
+ </library>
+ <library name="htmlunit-core-js-2.17.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpclient-4.5.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpcore-4.4.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpmime-4.5.jar">
+ <export name="*"/>
+ </library>
+ <library name="ini4j-0.5.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-io-9.2.12.v20150709.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-util-9.2.12.v20150709.jar">
+ <export name="*"/>
+ </library>
+ <library name="jna-4.1.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="jna-platform-4.1.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="nekohtml-1.9.22.jar">
+ <export name="*"/>
+ </library>
+ <library name="netty-3.5.2.Final.jar">
+ <export name="*"/>
+ </library>
+ <library name="operadriver-1.5.jar">
+ <export name="*"/>
+ </library>
+ <library name="operalaunchers-1.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="phantomjsdriver-1.2.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="protobuf-java-2.4.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="sac-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-api-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-chrome-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-edge-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-firefox-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-htmlunit-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-ie-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-java-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-leg-rc-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-remote-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-safari-driver-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-support-2.48.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="serializer-2.7.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="webbit-0.4.14.jar">
+ <export name="*"/>
+ </library>
+ <library name="websocket-api-9.2.12.v20150709.jar">
+ <export name="*"/>
+ </library>
+ <library name="websocket-client-9.2.12.v20150709.jar">
+ <export name="*"/>
+ </library>
+ <library name="websocket-common-9.2.12.v20150709.jar">
+ <export name="*"/>
+ </library>
+ <library name="xalan-2.7.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="xercesImpl-2.11.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="xml-apis-1.4.01.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/pom.xml b/nutch-plugins/lib-selenium/pom.xml
new file mode 100644
index 0000000..fed912d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/pom.xml
@@ -0,0 +1,49 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>lib-selenium</artifactId>
+ <packaging>jar</packaging>
+
+ <name>lib-selenium</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> <version>2.48.2</version>
+ </dependency>
+ <dependency>
+ <groupId>com.opera</groupId> <artifactId>operadriver</artifactId> <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>com.codeborne</groupId> <artifactId>phantomjsdriver</artifactId> <version>1.2.1</version>
+ </dependency>
+ </dependencies>
+
+</project>