You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:57 UTC
[13/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
deleted file mode 100644
index 9f616fe..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ /dev/null
@@ -1,587 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-
-// JDK imports
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.Reader;
-import java.net.URL;
-import java.util.*;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.concurrent.ThreadLocalRandom;
-// Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.DeflateUtils;
-import org.apache.hadoop.util.StringUtils;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-
-// crawler-commons imports
-import crawlercommons.robots.BaseRobotRules;
-
-public abstract class HttpBase implements Protocol {
-
- public static final Text RESPONSE_TIME = new Text("_rs_");
-
- public static final int BUFFER_SIZE = 8 * 1024;
-
- private static final byte[] EMPTY_CONTENT = new byte[0];
-
- private HttpRobotRulesParser robots = null;
-
- private ArrayList<String> userAgentNames = null;
-
- /** The proxy hostname. */
- protected String proxyHost = null;
-
- /** The proxy port. */
- protected int proxyPort = 8080;
-
- /** The proxy exception list. */
- protected HashMap proxyException = new HashMap();
-
- /** Indicates if a proxy is used */
- protected boolean useProxy = false;
-
- /** The network timeout in millisecond */
- protected int timeout = 10000;
-
- /** The length limit for downloaded content, in bytes. */
- protected int maxContent = 64 * 1024;
-
- /** The Nutch 'User-Agent' request header */
- protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
- "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
-
- /** The "Accept-Language" request header value. */
- protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
-
- /** The "Accept" request header value. */
- protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-
- /** The default logger */
- private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
-
- /** The specified logger */
- private Logger logger = LOGGER;
-
- /** The nutch configuration */
- private Configuration conf = null;
-
- /** Do we use HTTP/1.1? */
- protected boolean useHttp11 = false;
-
- /**
- * Record response time in CrawlDatum's meta data, see property
- * http.store.responsetime.
- */
- protected boolean responseTime = true;
-
- /** Skip page if Crawl-Delay longer than this value. */
- protected long maxCrawlDelay = -1L;
-
- /** Which TLS/SSL protocols to support */
- protected Set<String> tlsPreferredProtocols;
-
- /** Which TLS/SSL cipher suites to support */
- protected Set<String> tlsPreferredCipherSuites;
-
- /** Configuration directive for If-Modified-Since HTTP header */
- public boolean enableIfModifiedsinceHeader = true;
-
- /** Creates a new instance of HttpBase */
- public HttpBase() {
- this(null);
- }
-
- /** Creates a new instance of HttpBase */
- public HttpBase(Logger logger) {
- if (logger != null) {
- this.logger = logger;
- }
- robots = new HttpRobotRulesParser();
- }
-
- /**
-  * Stores the given configuration and (re)initialises this instance from it:
-  * proxy settings, timeout, content limit, the User-Agent string, the
-  * "Accept" and "Accept-Language" header values, the HTTP/1.1, response-time
-  * and If-Modified-Since flags, the optional list of rotating agent names and
-  * the TLS protocols / cipher suites. The configuration is also handed to the
-  * robots.txt parser, and the effective settings are logged via logConf().
-  *
-  * @param conf the configuration to read the http.* properties from
-  */
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.proxyHost = conf.get("http.proxy.host");
- this.proxyPort = conf.getInt("http.proxy.port", 8080);
- this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
- this.useProxy = (proxyHost != null && proxyHost.length() > 0);
- this.timeout = conf.getInt("http.timeout", 10000);
- this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
- this.userAgent = getAgentString(conf.get("http.agent.name"),
- conf.get("http.agent.version"), conf.get("http.agent.description"),
- conf.get("http.agent.url"), conf.get("http.agent.email"));
- this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
- this.accept = conf.get("http.accept", accept);
- // backward-compatible default setting
- this.useHttp11 = conf.getBoolean("http.useHttp11", false);
- this.responseTime = conf.getBoolean("http.store.responsetime", true);
- this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
- this.robots.setConf(conf);
-
- // NUTCH-1941: read list of alternating agent names (one per line, blank lines skipped)
- if (conf.getBoolean("http.agent.rotate", false)) {
- String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
- BufferedReader br = null;
- try {
- Reader reader = conf.getConfResourceAsReader(agentsFile);
- br = new BufferedReader(reader);
- userAgentNames = new ArrayList<String>();
- String word = "";
- while ((word = br.readLine()) != null) {
- if (!word.trim().isEmpty())
- userAgentNames.add(word.trim());
- }
- // An empty list is reset to null so that getUserAgent() uses the fixed agent string.
- if (userAgentNames.size() == 0) {
- logger.warn("Empty list of user agents in http.agent.rotate.file {}",
- agentsFile);
- userAgentNames = null;
- }
-
- } catch (Exception e) {
- logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
- StringUtils.stringifyException(e));
- userAgentNames = null;
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- // ignore
- }
- }
- }
- if (userAgentNames == null) {
- logger
- .warn("Falling back to fixed user agent set via property http.agent.name");
- }
- }
- // The values listed after each property name are passed to getStrings() as defaults.
- String[] protocols = conf.getStrings("http.tls.supported.protocols",
- "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
- String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
- "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
- "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_RSA_WITH_AES_256_CBC_SHA256",
- "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
- "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
- "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
- "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
- "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
- "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
- "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
- "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
- "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
- "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
- "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDH_RSA_WITH_RC4_128_SHA",
- "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
- "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
- "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
- "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
- "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
- "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
- "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
- "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
- "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
- "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
- "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
- "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
- "TLS_KRB5_WITH_DES_CBC_MD5");
-
- tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
- tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
-
- logConf();
- }
-
- // Inherited Javadoc
- public Configuration getConf() {
- return this.conf;
- }
-
- public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
- // Issues one request (followRedirects is passed as false) and maps the HTTP status code to a ProtocolStatus.
- String urlString = url.toString();
- try {
- URL u = new URL(urlString);
- // Only the getResponse() call itself is timed.
- long startTime = System.currentTimeMillis();
- Response response = getResponse(u, datum, false); // make a request
- // Record the elapsed milliseconds in the datum's metadata when enabled.
- if (this.responseTime) {
- int elapsedTime = (int) (System.currentTimeMillis() - startTime);
- datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
- }
- // The raw HTTP status code is always stored in the datum's metadata.
- int code = response.getCode();
- datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
- new Text(Integer.toString(code)));
- // A null response body is replaced by an empty array before building the Content.
- byte[] content = response.getContent();
- Content c = new Content(u.toString(), u.toString(),
- (content == null ? EMPTY_CONTENT : content),
- response.getHeader("Content-Type"), response.getHeaders(), this.conf);
-
- if (code == 200) { // got a good response
- return new ProtocolOutput(c); // return it
-
- } else if (code >= 300 && code < 400) { // handle redirect
- String location = response.getHeader("Location");
- // some broken servers, such as MS IIS, use lowercase header name...
- if (location == null)
- location = response.getHeader("location");
- if (location == null)
- location = "";
- u = new URL(u, location); // resolve the target relative to the requested URL
- int protocolStatusCode;
- switch (code) {
- case 300: // multiple choices, preferred value in Location
- protocolStatusCode = ProtocolStatus.MOVED;
- break;
- case 301: // moved permanently
- case 305: // use proxy (Location is URL of proxy)
- protocolStatusCode = ProtocolStatus.MOVED;
- break;
- case 302: // found (temporarily moved)
- case 303: // see other (redirect after POST)
- case 307: // temporary redirect
- protocolStatusCode = ProtocolStatus.TEMP_MOVED;
- break;
- case 304: // not modified
- protocolStatusCode = ProtocolStatus.NOTMODIFIED;
- break;
- default: // any other 3xx code is reported as MOVED
- protocolStatusCode = ProtocolStatus.MOVED;
- }
- // handle this in the higher layer.
- return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
- } else if (code == 400) { // bad request, mark as GONE
- if (logger.isTraceEnabled()) {
- logger.trace("400 Bad request: " + u);
- }
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
- } else if (code == 401) { // requires authorization, but no valid auth
- // provided.
- if (logger.isTraceEnabled()) {
- logger.trace("401 Authentication Required");
- }
- return new ProtocolOutput(c, new ProtocolStatus(
- ProtocolStatus.ACCESS_DENIED, "Authentication required: "
- + urlString));
- } else if (code == 404) {
- return new ProtocolOutput(c, new ProtocolStatus(
- ProtocolStatus.NOTFOUND, u));
- } else if (code == 410) { // permanently GONE
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
- "Http: " + code + " url=" + u));
- } else { // every remaining status code is reported as EXCEPTION
- return new ProtocolOutput(c, new ProtocolStatus(
- ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
- }
- } catch (Throwable e) { // any failure is logged and returned as a status, not rethrown
- logger.error("Failed to get protocol output", e);
- return new ProtocolOutput(null, new ProtocolStatus(e));
- }
- }
-
- /*
- * -------------------------- * </implementation:Protocol> *
- * --------------------------
- */
-
- public String getProxyHost() {
- return proxyHost;
- }
-
- public int getProxyPort() {
- return proxyPort;
- }
-
- public boolean useProxy(URL url) {
- if (!useProxy){
- return false;
- } else if (proxyException.get(url.getHost())!=null){
- return false;
- }
- return useProxy;
- }
-
- public int getTimeout() {
- return timeout;
- }
-
- public boolean isIfModifiedSinceEnabled() {
- return enableIfModifiedsinceHeader;
- }
-
- public int getMaxContent() {
- return maxContent;
- }
-
- public String getUserAgent() {
- if (userAgentNames!=null) {
- return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1));
- }
- return userAgent;
- }
-
- /**
- * Value of "Accept-Language" request header sent by Nutch.
- *
- * @return The value of the header "Accept-Language" header.
- */
- public String getAcceptLanguage() {
- return acceptLanguage;
- }
-
- public String getAccept() {
- return accept;
- }
-
- public boolean getUseHttp11() {
- return useHttp11;
- }
-
- public Set<String> getTlsPreferredCipherSuites() {
- return tlsPreferredCipherSuites;
- }
-
- public Set<String> getTlsPreferredProtocols() {
- return tlsPreferredProtocols;
- }
-
- /**
-  * Builds the User-Agent header value in the form
-  * {@code name[/version][ (description; url; email)]}.
-  * Null or empty optional parts are left out together with their
-  * separator, so no dangling "/" or "; " is produced. An error is logged
-  * when the agent name is null or blank; the name is still appended as is
-  * (a null name yields the literal text "null", as before).
-  *
-  * @param agentName    value of http.agent.name
-  * @param agentVersion optional version, appended after a "/"
-  * @param agentDesc    optional description
-  * @param agentURL     optional URL
-  * @param agentEmail   optional e-mail address
-  * @return the assembled agent string
-  */
- private static String getAgentString(String agentName, String agentVersion,
-     String agentDesc, String agentURL, String agentEmail) {
-
-   if ((agentName == null) || (agentName.trim().length() == 0)) {
-     // TODO : NUTCH-258
-     if (LOGGER.isErrorEnabled()) {
-       LOGGER.error("No User-Agent string set (http.agent.name)!");
-     }
-   }
-
-   StringBuilder buf = new StringBuilder();
-   buf.append(agentName);
-   if ((agentVersion != null) && (agentVersion.length() != 0)) {
-     buf.append("/").append(agentVersion);
-   }
-
-   // Join the non-empty details with "; " inside one pair of parentheses.
-   // The separator is written before each detail after the first, so an
-   // empty (but non-null) trailing field can no longer leave "; " behind.
-   String[] details = { agentDesc, agentURL, agentEmail };
-   boolean opened = false;
-   for (String detail : details) {
-     if ((detail == null) || (detail.length() == 0)) {
-       continue;
-     }
-     buf.append(opened ? "; " : " (").append(detail);
-     opened = true;
-   }
-   if (opened) {
-     buf.append(")");
-   }
-   return buf.toString();
- }
-
- /**
-  * Logs the effective HTTP settings at INFO level. Does nothing when INFO
-  * logging is disabled.
-  */
- protected void logConf() {
-   if (logger.isInfoEnabled()) {
-     logger.info("http.proxy.host = " + proxyHost);
-     logger.info("http.proxy.port = " + proxyPort);
-     // log the configured exception hosts; this line used to print the
-     // useProxy flag under the exception-list label
-     logger.info("http.proxy.exception.list = " + proxyException.keySet());
-     logger.info("http.timeout = " + timeout);
-     logger.info("http.content.limit = " + maxContent);
-     logger.info("http.agent = " + userAgent);
-     logger.info("http.accept.language = " + acceptLanguage);
-     logger.info("http.accept = " + accept);
-   }
- }
-
- public byte[] processGzipEncoded(byte[] compressed, URL url)
- throws IOException {
-
- if (LOGGER.isTraceEnabled()) {
- LOGGER.trace("uncompressing....");
- }
-
- // content can be empty (i.e. redirection) in which case
- // there is nothing to unzip
- if (compressed.length == 0)
- return compressed;
-
- byte[] content;
- if (getMaxContent() >= 0) {
- content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
- } else {
- content = GZIPUtils.unzipBestEffort(compressed);
- }
-
- if (content == null)
- throw new IOException("unzipBestEffort returned null");
-
- if (LOGGER.isTraceEnabled()) {
- LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to " + content.length
- + " bytes) from " + url);
- }
- return content;
- }
-
- public byte[] processDeflateEncoded(byte[] compressed, URL url)
- throws IOException {
-
- // content can be empty (i.e. redirection) in which case
- // there is nothing to deflate
- if (compressed.length == 0)
- return compressed;
-
- if (LOGGER.isTraceEnabled()) {
- LOGGER.trace("inflating....");
- }
-
- byte[] content;
- if (getMaxContent() >= 0) {
- content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
- } else {
- content = DeflateUtils.inflateBestEffort(compressed);
- }
-
- if (content == null)
- throw new IOException("inflateBestEffort returned null");
-
- if (LOGGER.isTraceEnabled()) {
- LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to " + content.length
- + " bytes) from " + url);
- }
- return content;
- }
-
- /**
-  * Command-line helper for concrete HTTP protocol implementations: fetches a
-  * single URL with the given instance and prints status, content type,
-  * content length and the content itself to standard output.
-  * <p>
-  * Arguments: {@code [-verbose] [-timeout N] url}. The URL must be the last
-  * argument; N is the timeout in seconds. {@code -verbose} is accepted but
-  * currently has no effect. On a usage error the usage line is printed to
-  * standard error and the JVM exits with status -1.
-  *
-  * @param http the protocol instance used to fetch the URL
-  * @param args the command-line arguments
-  * @throws Exception if the timeout value is not a number
-  */
- protected static void main(HttpBase http, String[] args) throws Exception {
-   String url = null;
-
-   String usage = "Usage: Http [-verbose] [-timeout N] url";
-
-   if (args.length == 0) {
-     System.err.println(usage);
-     System.exit(-1);
-   }
-
-   for (int i = 0; i < args.length; i++) { // parse command line
-     if (args[i].equals("-timeout")) { // found -timeout option
-       if (i + 1 >= args.length) {
-         // "-timeout" without a value used to fail with
-         // ArrayIndexOutOfBoundsException
-         System.err.println(usage);
-         System.exit(-1);
-       }
-       http.timeout = Integer.parseInt(args[++i]) * 1000;
-     } else if (args[i].equals("-verbose")) { // found -verbose option
-       // accepted for compatibility; no effect at present
-     } else if (i != args.length - 1) {
-       System.err.println(usage);
-       System.exit(-1);
-     } else {
-       // url is required parameter
-       url = args[i];
-     }
-   }
-
-   if (url == null) {
-     // only options were given: previously new Text(null) failed with an
-     // unhelpful NullPointerException
-     System.err.println(usage);
-     System.exit(-1);
-   }
-
-   ProtocolOutput out = http
-       .getProtocolOutput(new Text(url), new CrawlDatum());
-   Content content = out.getContent();
-
-   System.out.println("Status: " + out.getStatus());
-   if (content != null) {
-     System.out.println("Content Type: " + content.getContentType());
-     System.out.println("Content Length: "
-         + content.getMetadata().get(Response.CONTENT_LENGTH));
-     System.out.println("Content:");
-     String text = new String(content.getContent());
-     System.out.println(text);
-   }
- }
-
- protected abstract Response getResponse(URL url, CrawlDatum datum,
- boolean followRedirects) throws ProtocolException, IOException;
-
- public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
- return robots.getRobotRulesSet(this, url);
- }
-
- /**
-  * Transforms a String[] into a HashMap for faster searching. Each entry is
-  * trimmed and stored as both key and value; null and blank entries are
-  * skipped. Trimming matters because the entries come from a comma-separated
-  * property value, where "a, b" yields " b" with a leading blank that would
-  * never match a host name.
-  *
-  * @param input String[], may be null or empty
-  * @return a new HashMap, empty when there is nothing to store
-  */
- private HashMap arrayToMap(String[] input) {
-   HashMap<String, String> hm = new HashMap<String, String>();
-   if (input == null) {
-     return hm;
-   }
-   for (String entry : input) {
-     if (entry == null) {
-       continue;
-     }
-     String host = entry.trim();
-     if (!host.isEmpty()) {
-       // store the trimmed form: the lookup key is URL.getHost(), which
-       // never carries surrounding whitespace
-       hm.put(host, host);
-     }
-   }
-   return hm;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
deleted file mode 100644
index ff7ef5b..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-
-// Nutch imports
-import org.apache.nutch.protocol.ProtocolException;
-
-public class HttpException extends ProtocolException { // ProtocolException subtype of the HTTP plugins' common API; adds no state of its own
- /** Delegates to the no-argument superclass constructor. */
- public HttpException() {
- super();
- }
- /** Passes the given message to the superclass constructor. */
- public HttpException(String message) {
- super(message);
- }
- /** Passes the given message and cause to the superclass constructor. */
- public HttpException(String message, Throwable cause) {
- super(message, cause);
- }
- /** Passes the given cause to the superclass constructor. */
- public HttpException(Throwable cause) {
- super(cause);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
deleted file mode 100644
index 185ca15..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.http.api;
-
-import java.net.URL;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.RobotRulesParser;
-
-import crawlercommons.robots.BaseRobotRules;
-
-/**
- * This class is used for parsing robots for urls belonging to HTTP protocol. It
- * extends the generic {@link RobotRulesParser} class and contains Http protocol
- * specific implementation for obtaining the robots file.
- */
-public class HttpRobotRulesParser extends RobotRulesParser {
-
- public static final Logger LOG = LoggerFactory
- .getLogger(HttpRobotRulesParser.class);
- protected boolean allowForbidden = false;
-
- HttpRobotRulesParser() {
- }
-
- public HttpRobotRulesParser(Configuration conf) {
- setConf(conf);
- }
-
- public void setConf(Configuration conf) {
- super.setConf(conf);
- allowForbidden = conf.getBoolean("http.robots.403.allow", true);
- }
-
- /** Compose unique key to store and access robot rules in cache for given URL */
- protected static String getCacheKey(URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower
- // case
- String host = url.getHost().toLowerCase(); // normalize to lower case
- int port = url.getPort();
- if (port == -1) {
- port = url.getDefaultPort();
- }
- /*
- * Robot rules apply only to host, protocol, and port where robots.txt is
- * hosted (cf. NUTCH-1752). Consequently
- */
- String cacheKey = protocol + ":" + host + ":" + port;
- return cacheKey;
- }
-
- /**
- * Get the rules from robots.txt which applies for the given {@code url}.
- * Robot rules are cached for a unique combination of host, protocol, and
- * port. If no rules are found in the cache, a HTTP request is send to fetch
- * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
- * rules are cached to avoid re-fetching and re-parsing it again.
- *
- * @param http
- * The {@link Protocol} object
- * @param url
- * URL robots.txt applies to
- *
- * @return {@link BaseRobotRules} holding the rules from robots.txt
- */
- public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
-
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
- }
-
- String cacheKey = getCacheKey(url);
- BaseRobotRules robotRules = CACHE.get(cacheKey);
-
- if (robotRules != null) {
- return robotRules; // cached rule
- } else if (LOG.isTraceEnabled()) {
- LOG.trace("cache miss " + url);
- }
-
- boolean cacheRule = true;
- URL redir = null;
-
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
- // (we do not need to fetch robots.txt)
- robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
- url.getHost());
-
- } else {
- try {
- Response response = ((HttpBase) http).getResponse(new URL(url,
- "/robots.txt"), new CrawlDatum(), true);
- // try one level of redirection ?
- if (response.getCode() == 301 || response.getCode() == 302) {
- String redirection = response.getHeader("Location");
- if (redirection == null) {
- // some versions of MS IIS are known to mangle this header
- redirection = response.getHeader("location");
- }
- if (redirection != null) {
- if (!redirection.startsWith("http")) {
- // RFC says it should be absolute, but apparently it isn't
- redir = new URL(url, redirection);
- } else {
- redir = new URL(redirection);
- }
-
- response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
- true);
- }
- }
-
- if (response.getCode() == 200) // found rules: parse them
- robotRules = parseRules(url.toString(), response.getContent(),
- response.getHeader("Content-Type"), agentNames);
-
- else if ((response.getCode() == 403) && (!allowForbidden))
- robotRules = FORBID_ALL_RULES; // use forbid all
- else if (response.getCode() >= 500) {
- cacheRule = false; // try again later to fetch robots.txt
- robotRules = EMPTY_RULES;
- } else
- robotRules = EMPTY_RULES; // use default rules
- } catch (Throwable t) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
- }
- cacheRule = false; // try again later to fetch robots.txt
- robotRules = EMPTY_RULES;
- }
- }
-
- if (cacheRule) {
- CACHE.put(cacheKey, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
- // cache also for the redirected host
- CACHE.put(getCacheKey(redir), robotRules);
- }
- }
-
- return robotRules;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
deleted file mode 100644
index 972bb3c..0000000
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<html>
-<body>
-<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http},
-{@link org.apache.nutch.protocol.httpclient httpclient})</p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
deleted file mode 100644
index 23e4ef6..0000000
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.http.api;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import crawlercommons.robots.BaseRobotRules;
-
-/**
- * JUnit test case which tests 1. that robots filtering is performed correctly
- * as per the agent name 2. that crawl delay is extracted correctly from the
- * robots file
- *
- */
-public class TestRobotRulesParser {
-
-  private static final String CONTENT_TYPE = "text/plain";
-  private static final String SINGLE_AGENT = "Agent1";
-  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
-  private static final String UNKNOWN_AGENT = "AgentABC";
-  private static final String CR = "\r";
-
-  /** robots.txt fixture: rules for Agent1, Agent2 and the wildcard agent. */
-  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
-      + "Disallow: /a" + CR
-      + "Disallow: /b/a" + CR
-      + "#Disallow: /c" + CR
-      + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
-      + "" + CR
-      + "" + CR
-      + "User-Agent: Agent2" + CR
-      + "Disallow: /a/bloh" + CR
-      + "Disallow: /c" + CR
-      + "Disallow: /foo" + CR
-      + "Crawl-delay: 20" + CR
-      + "" + CR
-      + "User-Agent: *" + CR
-      + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
-
-  private static final String[] TEST_PATHS = new String[] {
-      "http://example.com/a", "http://example.com/a/bloh/foo.html",
-      "http://example.com/b", "http://example.com/c",
-      "http://example.com/b/a/index.html",
-      "http://example.com/foo/bar/baz.html" };
-
-  /** Expected isAllowed() result for the path at the same index. */
-  private static final boolean[] RESULTS = new boolean[] { false, // /a
-      false, // /a/bloh/foo.html
-      true, // /b
-      true, // /c
-      false, // /b/a/index.html
-      true // /foo/bar/baz.html
-  };
-
-  private HttpRobotRulesParser parser;
-  private BaseRobotRules rules;
-
-  public TestRobotRulesParser() {
-    parser = new HttpRobotRulesParser();
-  }
-
-  /**
-   * Parses the fixture for the given agent name(s). The bytes are encoded
-   * with an explicit charset so the test does not depend on the platform
-   * default.
-   */
-  private BaseRobotRules parseFor(String testName, String agents) {
-    return parser.parseRules(testName,
-        ROBOTS_STRING.getBytes(java.nio.charset.StandardCharsets.UTF_8),
-        CONTENT_TYPE, agents);
-  }
-
-  /** Asserts that every test path is allowed or denied as listed in RESULTS. */
-  private void assertPaths(String label, String agents) {
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      // assertEquals reports expected and actual value on failure
-      Assert.assertEquals("testing on " + label + " (" + agents
-          + "), and path " + TEST_PATHS[counter], RESULTS[counter],
-          rules.isAllowed(TEST_PATHS[counter]));
-    }
-  }
-
-  /**
-   * Test that the robots rules are interpreted correctly by the robots rules
-   * parser.
-   */
-  @Test
-  public void testRobotsAgent() {
-    rules = parseFor("testRobotsAgent", SINGLE_AGENT);
-    assertPaths("agent", SINGLE_AGENT);
-
-    rules = parseFor("testRobotsAgent", MULTIPLE_AGENTS);
-    assertPaths("agents", MULTIPLE_AGENTS);
-  }
-
-  /**
-   * Test that the crawl delay is extracted from the robots file for respective
-   * agent. If its not specified for a given agent, default value must be
-   * returned.
-   */
-  @Test
-  public void testCrawlDelay() {
-    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
-    // returned by the parser
-    rules = parseFor("testCrawlDelay", SINGLE_AGENT);
-    Assert.assertEquals("testing crawl delay for agent " + SINGLE_AGENT
-        + " : ", 10000L, rules.getCrawlDelay());
-
-    // for UNKNOWN_AGENT, the default crawl delay must be returned.
-    rules = parseFor("testCrawlDelay", UNKNOWN_AGENT);
-    Assert.assertEquals("testing crawl delay for agent " + UNKNOWN_AGENT
-        + " : ", Long.MIN_VALUE, rules.getCrawlDelay());
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-nekohtml/build.xml b/src/plugin/lib-nekohtml/build.xml
deleted file mode 100644
index 4bca1af..0000000
--- a/src/plugin/lib-nekohtml/build.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-nekohtml" default="jar">
-
- <import file="../build-plugin.xml"/>
-
- <!--
- ! Override the compile and jar targets,
- ! since there is nothing to compile here.
- ! -->
- <target name="compile" depends="init, resolve-default"/>
-
- <target name="jar" depends="compile"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-nekohtml/ivy.xml b/src/plugin/lib-nekohtml/ivy.xml
deleted file mode 100644
index ed70b80..0000000
--- a/src/plugin/lib-nekohtml/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-nekohtml/plugin.xml b/src/plugin/lib-nekohtml/plugin.xml
deleted file mode 100644
index 513c9a7..0000000
--- a/src/plugin/lib-nekohtml/plugin.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! NekoHTML is a simple HTML scanner and tag balancer that enables
- ! application programmers to parse HTML documents and access the
- ! information using standard XML interfaces.
- ! (http://sourceforge.net/projects/nekohtml/)
- !
- ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
- !-->
-<plugin
- id="lib-nekohtml"
- name="CyberNeko HTML Parser"
- version="1.9.19"
- provider-name="net.sourceforge.nekohtml">
-
- <runtime>
- <library name="nekohtml-1.9.19.jar">
- <export name="*"/>
- </library>
- </runtime>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/build.xml b/src/plugin/lib-regex-filter/build.xml
deleted file mode 100644
index 9702ca2..0000000
--- a/src/plugin/lib-regex-filter/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-regex-filter" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/ivy.xml b/src/plugin/lib-regex-filter/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/lib-regex-filter/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/plugin.xml b/src/plugin/lib-regex-filter/plugin.xml
deleted file mode 100644
index 42de8f1..0000000
--- a/src/plugin/lib-regex-filter/plugin.xml
+++ /dev/null
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! A common framework for RegExp based URL filters
- !-->
-<plugin
- id="lib-regex-filter"
- name="Regex URL Filter Framework"
- version="1.0"
- provider-name="org.apache.nutch">
-
- <runtime>
- <library name="lib-regex-filter.jar">
- <export name="*"/>
- </library>
- </runtime>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
deleted file mode 100644
index e408586..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
-
-/**
- * A generic regular expression rule.
- *
- * @author Jérôme Charron
- */
-public abstract class RegexRule {
-
- private final boolean sign;
-
- private final String hostOrDomain;
-
- private final String regex;
-
- /**
- * Constructs a new regular expression rule.
- *
- * @param sign
- * specifies if this rule must filter-in or filter-out. A
- * <code>true</code> value means that any url matching this rule must
- * be accepted, a <code>false</code> value means that any url
- * matching this rule must be rejected.
- * @param regex
- * is the regular expression used for matching (see
- * {@link #match(String)} method).
- */
- protected RegexRule(boolean sign, String regex) {
- this(sign, regex, null);
- }
-
- /**
- * Constructs a new regular expression rule.
- *
- * @param sign
- * specifies if this rule must filter-in or filter-out. A
- * <code>true</code> value means that any url matching this rule must
- * be accepted, a <code>false</code> value means that any url
- * matching this rule must be rejected.
- * @param regex
- * is the regular expression used for matching (see
- * {@link #match(String)} method).
- * @param hostOrDomain
- * the host or domain to which this regex belongs
- */
- protected RegexRule(boolean sign, String regex, String hostOrDomain) {
- this.sign = sign;
- this.hostOrDomain = hostOrDomain;
- this.regex = regex;
- }
-
- /**
- * Return if this rule is used for filtering-in or out.
- *
- * @return <code>true</code> if any url matching this rule must be accepted,
- * otherwise <code>false</code>.
- */
- protected boolean accept() {
- return sign;
- }
-
- /**
- * Return if this rule is used for filtering-in or out.
- *
- * @return host or domain this regex rule belongs to
- */
- protected String hostOrDomain() { return hostOrDomain; }
-
- /**
- * Return if this rule's regex.
- *
- * @return this regex
- */
- protected String regex() { return regex; }
-
- /**
- * Checks if a url matches this rule.
- *
- * @param url
- * is the url to check.
- * @return <code>true</code> if the specified url matches this rule, otherwise
- * <code>false</code>.
- */
- protected abstract boolean match(String url);
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
deleted file mode 100644
index f5cc081..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ /dev/null
@@ -1,315 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
-
-// JDK imports
-import java.io.File;
-import java.io.Reader;
-import java.io.FileReader;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.io.IOException;
-import java.io.StringReader;
-import java.net.MalformedURLException;
-import java.util.List;
-import java.util.ArrayList;
-
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.net.*;
-import org.apache.nutch.util.URLUtil;
-
-/**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
- * expressions.
- *
- * <p>
- * The regular expressions rules are expressed in a file. The file of rules is
- * determined for each implementation using the
- * {@link #getRulesReader(Configuration conf)} method.
- * </p>
- *
- * <p>
- * The format of this file is made of many rules (one per line):<br/>
- * <code>
- * [+-]<regex>
- * </code><br/>
- * where plus (<code>+</code>)means go ahead and index it and minus (
- * <code>-</code>)means no.
- * </p>
- *
- * @author Jérôme Charron
- */
-public abstract class RegexURLFilterBase implements URLFilter {
-
- /** My logger */
- private final static Logger LOG = LoggerFactory
- .getLogger(RegexURLFilterBase.class);
-
- /** An array of applicable rules */
- private List<RegexRule> rules;
-
- /** The current configuration */
- private Configuration conf;
-
- /**
- * Constructs a new empty RegexURLFilterBase
- */
- public RegexURLFilterBase() {
- }
-
- /**
- * Constructs a new RegexURLFilter and init it with a file of rules.
- *
- * @param filename
- * is the name of rules file.
- */
- public RegexURLFilterBase(File filename) throws IOException,
- IllegalArgumentException {
- this(new FileReader(filename));
- }
-
- /**
- * Constructs a new RegexURLFilter and inits it with a list of rules.
- *
- * @param rules
- * string with a list of rules, one rule per line
- * @throws IOException
- * @throws IllegalArgumentException
- */
- public RegexURLFilterBase(String rules) throws IOException,
- IllegalArgumentException {
- this(new StringReader(rules));
- }
-
- /**
- * Constructs a new RegexURLFilter and init it with a Reader of rules.
- *
- * @param reader
- * is a reader of rules.
- */
- protected RegexURLFilterBase(Reader reader) throws IOException,
- IllegalArgumentException {
- rules = readRules(reader);
- }
-
- /**
- * Creates a new {@link RegexRule}.
- *
- * @param sign
- * of the regular expression. A <code>true</code> value means that
- * any URL matching this rule must be included, whereas a
- * <code>false</code> value means that any URL matching this rule
- * must be excluded.
- * @param regex
- * is the regular expression associated to this rule.
- */
- protected abstract RegexRule createRule(boolean sign, String regex);
-
- /**
- * Creates a new {@link RegexRule}.
- * @param
- * sign of the regular expression.
- * A <code>true</code> value means that any URL matching this rule
- * must be included, whereas a <code>false</code>
- * value means that any URL matching this rule must be excluded.
- * @param regex
- * is the regular expression associated to this rule.
- * @param hostOrDomain
- * the host or domain to which this regex belongs
- */
- protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
-
- /**
- * Returns the name of the file of rules to use for a particular
- * implementation.
- *
- * @param conf
- * is the current configuration.
- * @return the name of the resource containing the rules to use.
- */
- protected abstract Reader getRulesReader(Configuration conf)
- throws IOException;
-
- /*
- * -------------------------- * <implementation:URLFilter> *
- * --------------------------
- */
-
- // Inherited Javadoc
- public String filter(String url) {
- String host = URLUtil.getHost(url);
- String domain = null;
-
- try {
- domain = URLUtil.getDomainName(url);
- } catch (MalformedURLException e) {
- // shouldnt happen here right?
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("URL belongs to host " + host + " and domain " + domain);
- }
-
- for (RegexRule rule : rules) {
- // Skip the skip for rules that don't share the same host and domain
- if (rule.hostOrDomain() != null &&
- !rule.hostOrDomain().equals(host) &&
- !rule.hostOrDomain().equals(domain)) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
- }
-
- continue;
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
- }
-
- if (rule.match(url)) {
- return rule.accept() ? url : null;
- }
- }
- ;
- return null;
- }
-
- /*
- * --------------------------- * </implementation:URLFilter> *
- * ---------------------------
- */
-
- /*
- * ----------------------------- * <implementation:Configurable> *
- * -----------------------------
- */
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- Reader reader = null;
- try {
- reader = getRulesReader(conf);
- } catch (Exception e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage());
- }
- throw new RuntimeException(e.getMessage(), e);
- }
- try {
- rules = readRules(reader);
- } catch (IOException e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage());
- }
- throw new RuntimeException(e.getMessage(), e);
- }
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- /*
- * ------------------------------ * </implementation:Configurable> *
- * ------------------------------
- */
-
- /**
- * Read the specified file of rules.
- *
- * @param reader
- * is a reader of regular expressions rules.
- * @return the corresponding {@RegexRule rules}.
- */
- private List<RegexRule> readRules(Reader reader) throws IOException,
- IllegalArgumentException {
-
- BufferedReader in = new BufferedReader(reader);
- List<RegexRule> rules = new ArrayList<RegexRule>();
- String line;
- String hostOrDomain = null;
-
- while ((line = in.readLine()) != null) {
- if (line.length() == 0) {
- continue;
- }
- char first = line.charAt(0);
- boolean sign = false;
- switch (first) {
- case '+':
- sign = true;
- break;
- case '-':
- sign = false;
- break;
- case ' ':
- case '\n':
- case '#': // skip blank & comment lines
- continue;
- case '>':
- hostOrDomain = line.substring(1).trim();
- continue;
- case '<':
- hostOrDomain = null;
- continue;
- default:
- throw new IOException("Invalid first character: " + line);
- }
-
- String regex = line.substring(1);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
- }
- RegexRule rule = createRule(sign, regex, hostOrDomain);
- rules.add(rule);
- }
- return rules;
- }
-
- /**
- * Filter the standard input using a RegexURLFilterBase.
- *
- * @param filter
- * is the RegexURLFilterBase to use for filtering the standard input.
- * @param args
- * some optional parameters (not used).
- */
- public static void main(RegexURLFilterBase filter, String args[])
- throws IOException, IllegalArgumentException {
-
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
- String line;
- while ((line = in.readLine()) != null) {
- String out = filter.filter(line);
- if (out != null) {
- System.out.print("+");
- System.out.println(out);
- } else {
- System.out.print("-");
- System.out.println(line);
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java
deleted file mode 100644
index b849353..0000000
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} library,
- * abstracting away from regular expression implementations.
- */
-package org.apache.nutch.urlfilter.api;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
deleted file mode 100644
index 0b58231..0000000
--- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.api;
-
-// JDK imports
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.junit.Assert;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Nutch imports
-import org.apache.nutch.net.URLFilter;
-
-/**
- * JUnit based test of class <code>RegexURLFilterBase</code>.
- *
- * @author Jérôme Charron
- */
-public abstract class RegexURLFilterBaseTest {
-
- /** My logger */
- protected static final Logger LOG = LoggerFactory
- .getLogger(RegexURLFilterBaseTest.class);
-
- private final static String SEPARATOR = System.getProperty("file.separator");
- private final static String SAMPLES = System.getProperty("test.data", ".");
-
- protected abstract URLFilter getURLFilter(Reader rules);
-
- protected void bench(int loops, String file) {
- try {
- bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
- } catch (Exception e) {
- Assert.fail(e.toString());
- }
- }
-
- protected void bench(int loops, Reader rules, Reader urls) {
- long start = System.currentTimeMillis();
- try {
- URLFilter filter = getURLFilter(rules);
- FilteredURL[] expected = readURLFile(urls);
- for (int i = 0; i < loops; i++) {
- test(filter, expected);
- }
- } catch (Exception e) {
- Assert.fail(e.toString());
- }
- LOG.info("bench time (" + loops + ") "
- + (System.currentTimeMillis() - start) + "ms");
- }
-
- protected void test(String file) {
- try {
- test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
- } catch (Exception e) {
- Assert.fail(e.toString());
- }
- }
-
- protected void test(Reader rules, Reader urls) {
- try {
- test(getURLFilter(rules), readURLFile(urls));
- } catch (Exception e) {
- Assert.fail(e.toString());
- }
- }
-
- protected void test(URLFilter filter, FilteredURL[] expected) {
- for (int i = 0; i < expected.length; i++) {
- String result = filter.filter(expected[i].url);
- if (result != null) {
- Assert.assertTrue(expected[i].url, expected[i].sign);
- } else {
- Assert.assertFalse(expected[i].url, expected[i].sign);
- }
- }
- }
-
- private static FilteredURL[] readURLFile(Reader reader) throws IOException {
- BufferedReader in = new BufferedReader(reader);
- List<FilteredURL> list = new ArrayList<FilteredURL>();
- String line;
- while ((line = in.readLine()) != null) {
- if (line.length() != 0) {
- list.add(new FilteredURL(line));
- }
- }
- return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
- }
-
- private static class FilteredURL {
-
- boolean sign;
- String url;
-
- FilteredURL(String line) {
- switch (line.charAt(0)) {
- case '+':
- sign = true;
- break;
- case '-':
- sign = false;
- break;
- default:
- // Simply ignore...
- }
- url = line.substring(1);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/build-ivy.xml b/src/plugin/lib-selenium/build-ivy.xml
deleted file mode 100644
index 3abcf6d..0000000
--- a/src/plugin/lib-selenium/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
- <property name="ivy.install.version" value="2.1.0" />
- <condition property="ivy.home" value="${env.IVY_HOME}">
- <isset property="env.IVY_HOME" />
- </condition>
- <property name="ivy.home" value="${user.home}/.ant" />
- <property name="ivy.checksums" value="" />
- <property name="ivy.jar.dir" value="${ivy.home}/lib" />
- <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
- <target name="download-ivy" unless="offline">
-
- <mkdir dir="${ivy.jar.dir}"/>
- <!-- download Ivy from web site so that it can be used even without any special installation -->
- <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
- dest="${ivy.jar.file}" usetimestamp="true"/>
- </target>
-
- <target name="init-ivy" depends="download-ivy">
- <!-- try to load ivy here from ivy home, in case the user has not already dropped
- it into ant's lib dir (note that the latter copy will always take precedence).
- We will not fail as long as local lib dir exists (it may be empty) and
- ivy is in at least one of ant's lib dir or the local lib dir. -->
- <path id="ivy.lib.path">
- <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
- </path>
- <taskdef resource="org/apache/ivy/ant/antlib.xml"
- uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
- </target>
-
- <target name="deps-jar" depends="init-ivy">
- <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/build.xml b/src/plugin/lib-selenium/build.xml
deleted file mode 100644
index 7c6d98d..0000000
--- a/src/plugin/lib-selenium/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-selenium" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-http/*.jar" />
- </fileset>
- </path>
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/howto_upgrade_selenium.txt
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt
deleted file mode 100644
index 1892a62..0000000
--- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-1. Upgrade the various driver version dependencies in src/plugin/lib-selenium/ivy.xml
-
-2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
-
- To get a list of dependencies and their versions execute:
- $ ant -f ./build-ivy.xml
- $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
-
- Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
-
- N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows:
-
- $ brew install gnu-sed --with-default-names
-
- You can then restart your terminal and the Regex + Sed command should work just fine!
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
deleted file mode 100644
index 701b725..0000000
--- a/src/plugin/lib-selenium/ivy.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../../ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <!-- begin selenium dependencies -->
- <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
-
- <dependency org="com.opera" name="operadriver" rev="1.5">
- <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
- </dependency>
- <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
- <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
- <exclude org="org.seleniumhq.selenium" name="selenium-java" />
- </dependency>
- <!-- end selenium dependencies -->
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml
deleted file mode 100644
index a86d665..0000000
--- a/src/plugin/lib-selenium/plugin.xml
+++ /dev/null
@@ -1,175 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! A library plugin exporting Selenium and its dependent libraries
- !-->
-<plugin
- id="lib-selenium"
- name="HTTP Framework"
- version="1.0"
- provider-name="org.apache.nutch">
-
- <runtime>
- <library name="lib-selenium.jar">
- <export name="*"/>
- </library>
- <!-- all classes from dependent libraries are exported -->
- <library name="cglib-nodep-2.1_3.jar">
- <export name="*"/>
- </library>
- <library name="commons-codec-1.10.jar">
- <export name="*"/>
- </library>
- <library name="commons-collections-3.2.1.jar">
- <export name="*"/>
- </library>
- <library name="commons-exec-1.3.jar">
- <export name="*"/>
- </library>
- <library name="commons-io-2.4.jar">
- <export name="*"/>
- </library>
- <library name="commons-jxpath-1.3.jar">
- <export name="*"/>
- </library>
- <library name="commons-lang3-3.4.jar">
- <export name="*"/>
- </library>
- <library name="commons-logging-1.2.jar">
- <export name="*"/>
- </library>
- <library name="cssparser-0.9.16.jar">
- <export name="*"/>
- </library>
- <library name="gson-2.3.1.jar">
- <export name="*"/>
- </library>
- <library name="guava-18.0.jar">
- <export name="*"/>
- </library>
- <library name="htmlunit-2.18.jar">
- <export name="*"/>
- </library>
- <library name="htmlunit-core-js-2.17.jar">
- <export name="*"/>
- </library>
- <library name="httpclient-4.5.1.jar">
- <export name="*"/>
- </library>
- <library name="httpcore-4.4.3.jar">
- <export name="*"/>
- </library>
- <library name="httpmime-4.5.jar">
- <export name="*"/>
- </library>
- <library name="ini4j-0.5.2.jar">
- <export name="*"/>
- </library>
- <library name="jetty-io-9.2.12.v20150709.jar">
- <export name="*"/>
- </library>
- <library name="jetty-util-9.2.12.v20150709.jar">
- <export name="*"/>
- </library>
- <library name="jna-4.1.0.jar">
- <export name="*"/>
- </library>
- <library name="jna-platform-4.1.0.jar">
- <export name="*"/>
- </library>
- <library name="nekohtml-1.9.22.jar">
- <export name="*"/>
- </library>
- <library name="netty-3.5.2.Final.jar">
- <export name="*"/>
- </library>
- <library name="operadriver-1.5.jar">
- <export name="*"/>
- </library>
- <library name="operalaunchers-1.1.jar">
- <export name="*"/>
- </library>
- <library name="phantomjsdriver-1.2.1.jar">
- <export name="*"/>
- </library>
- <library name="protobuf-java-2.4.1.jar">
- <export name="*"/>
- </library>
- <library name="sac-1.3.jar">
- <export name="*"/>
- </library>
- <library name="selenium-api-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-chrome-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-edge-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-firefox-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-htmlunit-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-ie-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-java-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-leg-rc-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-remote-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-safari-driver-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="selenium-support-2.48.2.jar">
- <export name="*"/>
- </library>
- <library name="serializer-2.7.2.jar">
- <export name="*"/>
- </library>
- <library name="webbit-0.4.14.jar">
- <export name="*"/>
- </library>
- <library name="websocket-api-9.2.12.v20150709.jar">
- <export name="*"/>
- </library>
- <library name="websocket-client-9.2.12.v20150709.jar">
- <export name="*"/>
- </library>
- <library name="websocket-common-9.2.12.v20150709.jar">
- <export name="*"/>
- </library>
- <library name="xalan-2.7.2.jar">
- <export name="*"/>
- </library>
- <library name="xercesImpl-2.11.0.jar">
- <export name="*"/>
- </library>
- <library name="xml-apis-1.4.01.jar">
- <export name="*"/>
- </library>
- </runtime>
-
-</plugin>