You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [15/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Thu Jan 29 05:38:59 2015
@@ -47,20 +47,20 @@ import org.apache.hadoop.io.Text;
import crawlercommons.robots.BaseRobotRules;
public abstract class HttpBase implements Protocol {
-
+
public static final Text RESPONSE_TIME = new Text("_rs_");
public static final int BUFFER_SIZE = 8 * 1024;
-
+
private static final byte[] EMPTY_CONTENT = new byte[0];
private HttpRobotRulesParser robots = null;
-
- /** The proxy hostname. */
+
+ /** The proxy hostname. */
protected String proxyHost = null;
/** The proxy port. */
- protected int proxyPort = 8080;
+ protected int proxyPort = 8080;
/** Indicates if a proxy is used */
protected boolean useProxy = false;
@@ -69,29 +69,27 @@ public abstract class HttpBase implement
protected int timeout = 10000;
/** The length limit for downloaded content, in bytes. */
- protected int maxContent = 64 * 1024;
+ protected int maxContent = 64 * 1024;
/** The Nutch 'User-Agent' request header */
- protected String userAgent = getAgentString(
- "NutchCVS", null, "Nutch",
- "http://nutch.apache.org/bot.html",
- "agent@nutch.apache.org");
+ protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+ "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
/** The "Accept-Language" request header value. */
protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
/** The "Accept" request header value. */
protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-
+
/** The default logger */
private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
/** The specified logger */
private Logger logger = LOGGER;
-
+
/** The nutch configuration */
private Configuration conf = null;
-
+
/** Do we use HTTP/1.1? */
protected boolean useHttp11 = false;
@@ -99,14 +97,14 @@ public abstract class HttpBase implement
* Record response time in CrawlDatum's meta data, see property
* http.store.responsetime.
*/
- protected boolean responseTime = true;
-
+ protected boolean responseTime = true;
+
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
-
+
/** Which TLS/SSL protocols to support */
protected Set<String> tlsPreferredProtocols;
-
+
/** Which TLS/SSL cipher suites to support */
protected Set<String> tlsPreferredCipherSuites;
@@ -114,7 +112,7 @@ public abstract class HttpBase implement
public HttpBase() {
this(null);
}
-
+
/** Creates a new instance of HttpBase */
public HttpBase(Logger logger) {
if (logger != null) {
@@ -122,134 +120,168 @@ public abstract class HttpBase implement
}
robots = new HttpRobotRulesParser();
}
-
+
// Inherited Javadoc
public void setConf(Configuration conf) {
- this.conf = conf;
- this.proxyHost = conf.get("http.proxy.host");
- this.proxyPort = conf.getInt("http.proxy.port", 8080);
- this.useProxy = (proxyHost != null && proxyHost.length() > 0);
- this.timeout = conf.getInt("http.timeout", 10000);
- this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
- this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
- .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
- this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
- this.accept = conf.get("http.accept", accept);
- // backward-compatible default setting
- this.useHttp11 = conf.getBoolean("http.useHttp11", false);
- this.responseTime = conf.getBoolean("http.store.responsetime", true);
- this.robots.setConf(conf);
-
- String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
- String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
- "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
- "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
- "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
- "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
- "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
- "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
- "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
+ this.conf = conf;
+ this.proxyHost = conf.get("http.proxy.host");
+ this.proxyPort = conf.getInt("http.proxy.port", 8080);
+ this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+ this.timeout = conf.getInt("http.timeout", 10000);
+ this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+ this.userAgent = getAgentString(conf.get("http.agent.name"),
+ conf.get("http.agent.version"), conf.get("http.agent.description"),
+ conf.get("http.agent.url"), conf.get("http.agent.email"));
+ this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
+ this.accept = conf.get("http.accept", accept);
+ // backward-compatible default setting
+ this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+ this.responseTime = conf.getBoolean("http.store.responsetime", true);
+ this.robots.setConf(conf);
+
+ String[] protocols = conf.getStrings("http.tls.supported.protocols",
+ "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+ String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_RSA_WITH_AES_256_CBC_SHA256",
+ "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+ "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+ "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+ "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+ "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+ "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+ "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+ "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+ "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+ "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+ "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+ "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+ "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+ "TLS_KRB5_WITH_DES_CBC_MD5");
- tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
- tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+ tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+ tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
- logConf();
+ logConf();
}
// Inherited Javadoc
public Configuration getConf() {
return this.conf;
}
-
+
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
-
+
String urlString = url.toString();
try {
URL u = new URL(urlString);
-
+
long startTime = System.currentTimeMillis();
Response response = getResponse(u, datum, false); // make a request
-
- if(this.responseTime) {
+
+ if (this.responseTime) {
int elapsedTime = (int) (System.currentTimeMillis() - startTime);
datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
}
-
+
int code = response.getCode();
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
- (content == null ? EMPTY_CONTENT : content),
- response.getHeader("Content-Type"),
- response.getHeaders(), this.conf);
-
+ (content == null ? EMPTY_CONTENT : content),
+ response.getHeader("Content-Type"), response.getHeaders(), this.conf);
+
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
-
+
} else if (code >= 300 && code < 400) { // handle redirect
String location = response.getHeader("Location");
// some broken servers, such as MS IIS, use lowercase header name...
- if (location == null) location = response.getHeader("location");
- if (location == null) location = "";
+ if (location == null)
+ location = response.getHeader("location");
+ if (location == null)
+ location = "";
u = new URL(u, location);
int protocolStatusCode;
switch (code) {
- case 300: // multiple choices, preferred value in Location
- protocolStatusCode = ProtocolStatus.MOVED;
- break;
- case 301: // moved permanently
- case 305: // use proxy (Location is URL of proxy)
- protocolStatusCode = ProtocolStatus.MOVED;
- break;
- case 302: // found (temporarily moved)
- case 303: // see other (redirect after POST)
- case 307: // temporary redirect
- protocolStatusCode = ProtocolStatus.TEMP_MOVED;
- break;
- case 304: // not modified
- protocolStatusCode = ProtocolStatus.NOTMODIFIED;
- break;
- default:
- protocolStatusCode = ProtocolStatus.MOVED;
+ case 300: // multiple choices, preferred value in Location
+ protocolStatusCode = ProtocolStatus.MOVED;
+ break;
+ case 301: // moved permanently
+ case 305: // use proxy (Location is URL of proxy)
+ protocolStatusCode = ProtocolStatus.MOVED;
+ break;
+ case 302: // found (temporarily moved)
+ case 303: // see other (redirect after POST)
+ case 307: // temporary redirect
+ protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+ break;
+ case 304: // not modified
+ protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+ break;
+ default:
+ protocolStatusCode = ProtocolStatus.MOVED;
}
// handle this in the higher layer.
return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
- if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
+ if (logger.isTraceEnabled()) {
+ logger.trace("400 Bad request: " + u);
+ }
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
- } else if (code == 401) { // requires authorization, but no valid auth provided.
- if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ } else if (code == 401) { // requires authorization, but no valid auth
+ // provided.
+ if (logger.isTraceEnabled()) {
+ logger.trace("401 Authentication Required");
+ }
+ return new ProtocolOutput(c, new ProtocolStatus(
+ ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ urlString));
} else if (code == 404) {
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+ return new ProtocolOutput(c, new ProtocolStatus(
+ ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
+ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
+ "Http: " + code + " url=" + u));
} else {
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
- + u));
+ return new ProtocolOutput(c, new ProtocolStatus(
+ ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
}
} catch (Throwable e) {
logger.error("Failed to get protocol output", e);
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
-
- /* -------------------------- *
- * </implementation:Protocol> *
- * -------------------------- */
+ /*
+ * -------------------------- * </implementation:Protocol> *
+ * --------------------------
+ */
public String getProxyHost() {
return proxyHost;
@@ -274,70 +306,69 @@ public abstract class HttpBase implement
public String getUserAgent() {
return userAgent;
}
-
- /** Value of "Accept-Language" request header sent by Nutch.
+
+ /**
+ * Value of "Accept-Language" request header sent by Nutch.
+ *
* @return The value of the header "Accept-Language" header.
*/
public String getAcceptLanguage() {
- return acceptLanguage;
+ return acceptLanguage;
}
public String getAccept() {
- return accept;
+ return accept;
}
public boolean getUseHttp11() {
return useHttp11;
}
-
+
public Set<String> getTlsPreferredCipherSuites() {
return tlsPreferredCipherSuites;
}
-
+
public Set<String> getTlsPreferredProtocols() {
return tlsPreferredProtocols;
}
- private static String getAgentString(String agentName,
- String agentVersion,
- String agentDesc,
- String agentURL,
- String agentEmail) {
-
- if ( (agentName == null) || (agentName.trim().length() == 0) ) {
+ private static String getAgentString(String agentName, String agentVersion,
+ String agentDesc, String agentURL, String agentEmail) {
+
+ if ((agentName == null) || (agentName.trim().length() == 0)) {
// TODO : NUTCH-258
if (LOGGER.isErrorEnabled()) {
LOGGER.error("No User-Agent string set (http.agent.name)!");
}
}
-
- StringBuffer buf= new StringBuffer();
-
+
+ StringBuffer buf = new StringBuffer();
+
buf.append(agentName);
if (agentVersion != null) {
buf.append("/");
buf.append(agentVersion);
}
- if ( ((agentDesc != null) && (agentDesc.length() != 0))
- || ((agentEmail != null) && (agentEmail.length() != 0))
- || ((agentURL != null) && (agentURL.length() != 0)) ) {
+ if (((agentDesc != null) && (agentDesc.length() != 0))
+ || ((agentEmail != null) && (agentEmail.length() != 0))
+ || ((agentURL != null) && (agentURL.length() != 0))) {
buf.append(" (");
-
+
if ((agentDesc != null) && (agentDesc.length() != 0)) {
buf.append(agentDesc);
- if ( (agentURL != null) || (agentEmail != null) )
+ if ((agentURL != null) || (agentEmail != null))
buf.append("; ");
}
-
+
if ((agentURL != null) && (agentURL.length() != 0)) {
buf.append(agentURL);
if (agentEmail != null)
buf.append("; ");
}
-
+
if ((agentEmail != null) && (agentEmail.length() != 0))
buf.append(agentEmail);
-
+
buf.append(")");
}
return buf.toString();
@@ -354,52 +385,59 @@ public abstract class HttpBase implement
logger.info("http.accept = " + accept);
}
}
-
- public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException {
- if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); }
+ public byte[] processGzipEncoded(byte[] compressed, URL url)
+ throws IOException {
+
+ if (LOGGER.isTraceEnabled()) {
+ LOGGER.trace("uncompressing....");
+ }
// content can be empty (i.e. redirection) in which case
// there is nothing to unzip
if (compressed.length == 0)
return compressed;
-
+
byte[] content;
if (getMaxContent() >= 0) {
- content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
+ content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
} else {
- content = GZIPUtils.unzipBestEffort(compressed);
- }
+ content = GZIPUtils.unzipBestEffort(compressed);
+ }
if (content == null)
throw new IOException("unzipBestEffort returned null");
if (LOGGER.isTraceEnabled()) {
LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to "
- + content.length + " bytes) from " + url);
+ + " bytes of compressed content (expanded to " + content.length
+ + " bytes) from " + url);
}
return content;
}
- public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+ public byte[] processDeflateEncoded(byte[] compressed, URL url)
+ throws IOException {
// content can be empty (i.e. redirection) in which case
// there is nothing to deflate
if (compressed.length == 0)
return compressed;
-
- if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
- byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+ if (LOGGER.isTraceEnabled()) {
+ LOGGER.trace("inflating....");
+ }
+
+ byte[] content = DeflateUtils
+ .inflateBestEffort(compressed, getMaxContent());
if (content == null)
throw new IOException("inflateBestEffort returned null");
if (LOGGER.isTraceEnabled()) {
LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to "
- + content.length + " bytes) from " + url);
+ + " bytes of compressed content (expanded to " + content.length
+ + " bytes) from " + url);
}
return content;
}
@@ -407,14 +445,14 @@ public abstract class HttpBase implement
protected static void main(HttpBase http, String[] args) throws Exception {
boolean verbose = false;
String url = null;
-
+
String usage = "Usage: Http [-verbose] [-timeout N] url";
-
+
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
}
-
+
for (int i = 0; i < args.length; i++) { // parse command line
if (args[i].equals("-timeout")) { // found -timeout option
http.timeout = Integer.parseInt(args[++i]) * 1000;
@@ -423,35 +461,34 @@ public abstract class HttpBase implement
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
- } else // root is required parameter
+ } else
+ // root is required parameter
url = args[i];
}
-
-// if (verbose) {
-// LOGGER.setLevel(Level.FINE);
-// }
-
- ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
+
+ // if (verbose) {
+ // LOGGER.setLevel(Level.FINE);
+ // }
+
+ ProtocolOutput out = http
+ .getProtocolOutput(new Text(url), new CrawlDatum());
Content content = out.getContent();
-
+
System.out.println("Status: " + out.getStatus());
if (content != null) {
System.out.println("Content Type: " + content.getContentType());
- System.out.println("Content Length: " +
- content.getMetadata().get(Response.CONTENT_LENGTH));
+ System.out.println("Content Length: "
+ + content.getMetadata().get(Response.CONTENT_LENGTH));
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
- }
+ }
}
-
- protected abstract Response getResponse(URL url,
- CrawlDatum datum,
- boolean followRedirects)
- throws ProtocolException, IOException;
+
+ protected abstract Response getResponse(URL url, CrawlDatum datum,
+ boolean followRedirects) throws ProtocolException, IOException;
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}
}
-
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java Thu Jan 29 05:38:59 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a
// Nutch imports
import org.apache.nutch.protocol.ProtocolException;
-
public class HttpException extends ProtocolException {
public HttpException() {
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Thu Jan 29 05:38:59 2015
@@ -32,36 +32,41 @@ import crawlercommons.robots.BaseRobotRu
import crawlercommons.robots.SimpleRobotRules;
/**
- * This class is used for parsing robots for urls belonging to HTTP protocol.
- * It extends the generic {@link RobotRulesParser} class and contains
- * Http protocol specific implementation for obtaining the robots file.
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
*/
public class HttpRobotRulesParser extends RobotRulesParser {
-
- public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HttpRobotRulesParser.class);
protected boolean allowForbidden = false;
- HttpRobotRulesParser() { }
+ HttpRobotRulesParser() {
+ }
public HttpRobotRulesParser(Configuration conf) {
- setConf(conf);
+ setConf(conf);
}
-
+
public void setConf(Configuration conf) {
- super.setConf(conf);
- allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+ super.setConf(conf);
+ allowForbidden = conf.getBoolean("http.robots.403.allow", true);
}
/** Compose unique key to store and access robot rules in cache for given URL */
protected static String getCacheKey(URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
- String host = url.getHost().toLowerCase(); // normalize to lower case
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+ // case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
int port = url.getPort();
if (port == -1) {
port = url.getDefaultPort();
}
- /* Robot rules apply only to host, protocol, and port where robots.txt is
- * hosted (cf. NUTCH-1752). Consequently */
+ /*
+ * Robot rules apply only to host, protocol, and port where robots.txt is
+ * hosted (cf. NUTCH-1752). Consequently
+ */
String cacheKey = protocol + ":" + host + ":" + port;
return cacheKey;
}
@@ -77,7 +82,7 @@ public class HttpRobotRulesParser extend
* The {@link Protocol} object
* @param url
* URL robots.txt applies to
- *
+ *
* @return {@link BaseRobotRules} holding the rules from robots.txt
*/
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
@@ -86,13 +91,15 @@ public class HttpRobotRulesParser extend
BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
boolean cacheRule = true;
-
- if (robotRules == null) { // cache miss
+
+ if (robotRules == null) { // cache miss
URL redir = null;
- if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
try {
- Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
- new CrawlDatum(), true);
+ Response response = ((HttpBase) http).getResponse(new URL(url,
+ "/robots.txt"), new CrawlDatum(), true);
// try one level of redirection ?
if (response.getCode() == 301 || response.getCode() == 302) {
String redirection = response.getHeader("Location");
@@ -107,23 +114,23 @@ public class HttpRobotRulesParser extend
} else {
redir = new URL(redirection);
}
-
- response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);
+
+ response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
+ true);
}
}
- if (response.getCode() == 200) // found rules: parse them
- robotRules = parseRules(url.toString(), response.getContent(),
- response.getHeader("Content-Type"),
- agentNames);
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"), agentNames);
- else if ( (response.getCode() == 403) && (!allowForbidden) )
- robotRules = FORBID_ALL_RULES; // use forbid all
+ else if ((response.getCode() == 403) && (!allowForbidden))
+ robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
cacheRule = false;
robotRules = EMPTY_RULES;
- }else
- robotRules = EMPTY_RULES; // use default rules
+ } else
+ robotRules = EMPTY_RULES; // use default rules
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
@@ -133,7 +140,7 @@ public class HttpRobotRulesParser extend
}
if (cacheRule) {
- CACHE.put(cacheKey, robotRules); // cache rules for host
+ CACHE.put(cacheKey, robotRules); // cache rules for host
if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
// cache also for the redirected host
CACHE.put(getCacheKey(redir), robotRules);
Modified: nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu Jan 29 05:38:59 2015
@@ -23,10 +23,10 @@ import org.junit.Test;
import crawlercommons.robots.BaseRobotRules;
/**
- * JUnit test case which tests
- * 1. that robots filtering is performed correctly as per the agent name
- * 2. that crawl delay is extracted correctly from the robots file
- *
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ *
*/
public class TestRobotRulesParser {
@@ -36,39 +36,32 @@ public class TestRobotRulesParser {
private static final String UNKNOWN_AGENT = "AgentABC";
private static final String CR = "\r";
- private static final String ROBOTS_STRING =
- "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR
- + "Disallow: /b/a" + CR
- + "#Disallow: /c" + CR
- + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
- + "" + CR
- + "" + CR
- + "User-Agent: Agent2" + CR
- + "Disallow: /a/bloh" + CR
- + "Disallow: /c" + CR
- + "Disallow: /foo" + CR
- + "Crawl-delay: 20" + CR
- + "" + CR
- + "User-Agent: *" + CR
- + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
+ private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+ + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+ + CR
+ + "Crawl-delay: 10"
+ + CR // set crawl delay for Agent1 as 10 sec
+ + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+ + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+ + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+ // crawl
+ // delay
+ // for
+ // other
+ // agents
private static final String[] TEST_PATHS = new String[] {
- "http://example.com/a",
- "http://example.com/a/bloh/foo.html",
- "http://example.com/b",
- "http://example.com/c",
- "http://example.com/b/a/index.html",
- "http://example.com/foo/bar/baz.html"
- };
-
- private static final boolean[] RESULTS = new boolean[] {
- false, // /a
- false, // /a/bloh/foo.html
- true, // /b
- true, // /c
- false, // /b/a/index.html
- true // /foo/bar/baz.html
+ "http://example.com/a", "http://example.com/a/bloh/foo.html",
+ "http://example.com/b", "http://example.com/c",
+ "http://example.com/b/a/index.html",
+ "http://example.com/foo/bar/baz.html" };
+
+ private static final boolean[] RESULTS = new boolean[] { false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ true, // /c
+ false, // /b/a/index.html
+ true // /foo/bar/baz.html
};
private HttpRobotRulesParser parser;
@@ -79,41 +72,52 @@ public class TestRobotRulesParser {
}
/**
- * Test that the robots rules are interpreted correctly by the robots rules parser.
+ * Test that the robots rules are interpreted correctly by the robots rules
+ * parser.
*/
@Test
public void testRobotsAgent() {
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT);
- for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- Assert.assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
- + "path " + TEST_PATHS[counter]
- + " got " + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, MULTIPLE_AGENTS);
- for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- Assert.assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
- + "path " + TEST_PATHS[counter]
- + " got " + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
}
/**
- * Test that the crawl delay is extracted from the robots file for respective agent.
- * If its not specified for a given agent, default value must be returned.
+ * Test that the crawl delay is extracted from the robots file for respective
+ * agent. If its not specified for a given agent, default value must be
+ * returned.
*/
@Test
public void testCrawlDelay() {
- // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
- Assert.assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
+ // for SINGLE_AGENT, the crawl delay of 10 sec, i.e. 10000 msec, must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT);
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+ (rules.getCrawlDelay() == 10000));
// for UNKNOWN_AGENT, the default crawl delay must be returned.
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
- Assert.assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, UNKNOWN_AGENT);
+ Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+ (rules.getCrawlDelay() == Long.MIN_VALUE));
}
}
Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Thu Jan 29 05:38:59 2015
@@ -16,11 +16,9 @@
*/
package org.apache.nutch.urlfilter.api;
-
-
/**
* A generic regular expression rule.
- *
+ *
* @author Jérôme Charron
*/
public abstract class RegexRule {
@@ -29,13 +27,15 @@ public abstract class RegexRule {
/**
* Constructs a new regular expression rule.
- *
- * @param sign specifies if this rule must filter-in or filter-out.
- * A <code>true</code> value means that any url matching this rule
- * must be accepted, a <code>false</code> value means that any url
- * matching this rule must be rejected.
- * @param regex is the regular expression used for matching (see
- * {@link #match(String)} method).
+ *
+ * @param sign
+ * specifies if this rule must filter-in or filter-out. A
+ * <code>true</code> value means that any url matching this rule must
+ * be accepted, a <code>false</code> value means that any url
+ * matching this rule must be rejected.
+ * @param regex
+ * is the regular expression used for matching (see
+ * {@link #match(String)} method).
*/
protected RegexRule(boolean sign, String regex) {
this.sign = sign;
@@ -43,19 +43,22 @@ public abstract class RegexRule {
/**
* Return if this rule is used for filtering-in or out.
- *
+ *
* @return <code>true</code> if any url matching this rule must be accepted,
* otherwise <code>false</code>.
*/
- protected boolean accept() { return sign; }
-
+ protected boolean accept() {
+ return sign;
+ }
+
/**
* Checks if a url matches this rule.
- * @param url is the url to check.
- * @return <code>true</code> if the specified url matches this rule,
- * otherwise <code>false</code>.
+ *
+ * @param url
+ * is the url to check.
+ * @return <code>true</code> if the specified url matches this rule, otherwise
+ * <code>false</code>.
*/
protected abstract boolean match(String url);
}
-
Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Thu Jan 29 05:38:59 2015
@@ -37,28 +37,32 @@ import org.apache.hadoop.conf.Configurat
// Nutch imports
import org.apache.nutch.net.*;
-
/**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
- * regular expressions.
- *
- * <p>The regular expressions rules are expressed in a file. The file of rules
- * is determined for each implementation using the
- * {@link #getRulesReader(Configuration conf)} method.</p>
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ *
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * determined for each implementation using the
+ * {@link #getRulesReader(Configuration conf)} method.
+ * </p>
*
- * <p>The format of this file is made of many rules (one per line):<br/>
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
* <code>
* [+-]<regex>
* </code><br/>
- * where plus (<code>+</code>)means go ahead and index it and minus
- * (<code>-</code>)means no.</p>
- *
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.
+ * </p>
+ *
* @author Jérôme Charron
*/
public abstract class RegexURLFilterBase implements URLFilter {
/** My logger */
- private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class);
+ private final static Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBase.class);
/** An array of applicable rules */
private List<RegexRule> rules;
@@ -66,24 +70,28 @@ public abstract class RegexURLFilterBase
/** The current configuration */
private Configuration conf;
-
/**
* Constructs a new empty RegexURLFilterBase
*/
- public RegexURLFilterBase() { }
+ public RegexURLFilterBase() {
+ }
/**
* Constructs a new RegexURLFilter and init it with a file of rules.
- * @param filename is the name of rules file.
+ *
+ * @param filename
+ * is the name of rules file.
*/
- public RegexURLFilterBase(File filename)
- throws IOException, IllegalArgumentException {
+ public RegexURLFilterBase(File filename) throws IOException,
+ IllegalArgumentException {
this(new FileReader(filename));
}
-
+
/**
* Constructs a new RegexURLFilter and inits it with a list of rules.
- * @param rules string with a list of rules, one rule per line
+ *
+ * @param rules
+ * string with a list of rules, one rule per line
* @throws IOException
* @throws IllegalArgumentException
*/
@@ -94,68 +102,82 @@ public abstract class RegexURLFilterBase
/**
* Constructs a new RegexURLFilter and init it with a Reader of rules.
- * @param reader is a reader of rules.
+ *
+ * @param reader
+ * is a reader of rules.
*/
- protected RegexURLFilterBase(Reader reader)
- throws IOException, IllegalArgumentException {
+ protected RegexURLFilterBase(Reader reader) throws IOException,
+ IllegalArgumentException {
rules = readRules(reader);
}
-
+
/**
* Creates a new {@link RegexRule}.
- * @param sign of the regular expression.
- * A <code>true</code> value means that any URL matching this rule
- * must be included, whereas a <code>false</code>
- * value means that any URL matching this rule must be excluded.
- * @param regex is the regular expression associated to this rule.
+ *
+ * @param sign
+ * of the regular expression. A <code>true</code> value means that
+ * any URL matching this rule must be included, whereas a
+ * <code>false</code> value means that any URL matching this rule
+ * must be excluded.
+ * @param regex
+ * is the regular expression associated to this rule.
*/
protected abstract RegexRule createRule(boolean sign, String regex);
-
+
/**
- * Returns the name of the file of rules to use for
- * a particular implementation.
- * @param conf is the current configuration.
+ * Returns the name of the file of rules to use for a particular
+ * implementation.
+ *
+ * @param conf
+ * is the current configuration.
* @return the name of the resource containing the rules to use.
*/
- protected abstract Reader getRulesReader(Configuration conf) throws IOException;
-
-
- /* -------------------------- *
- * <implementation:URLFilter> *
- * -------------------------- */
-
+ protected abstract Reader getRulesReader(Configuration conf)
+ throws IOException;
+
+ /*
+ * -------------------------- * <implementation:URLFilter> *
+ * --------------------------
+ */
+
// Inherited Javadoc
public String filter(String url) {
for (RegexRule rule : rules) {
if (rule.match(url)) {
return rule.accept() ? url : null;
}
- };
+ }
+ ;
return null;
}
- /* --------------------------- *
- * </implementation:URLFilter> *
- * --------------------------- */
-
-
- /* ----------------------------- *
- * <implementation:Configurable> *
- * ----------------------------- */
-
+ /*
+ * --------------------------- * </implementation:URLFilter> *
+ * ---------------------------
+ */
+
+ /*
+ * ----------------------------- * <implementation:Configurable> *
+ * -----------------------------
+ */
+
public void setConf(Configuration conf) {
this.conf = conf;
Reader reader = null;
try {
reader = getRulesReader(conf);
} catch (Exception e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
- throw new RuntimeException(e.getMessage(), e);
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+ throw new RuntimeException(e.getMessage(), e);
}
try {
rules = readRules(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
throw new RuntimeException(e.getMessage(), e);
}
}
@@ -163,45 +185,51 @@ public abstract class RegexURLFilterBase
public Configuration getConf() {
return this.conf;
}
-
- /* ------------------------------ *
- * </implementation:Configurable> *
- * ------------------------------ */
-
+
+ /*
+ * ------------------------------ * </implementation:Configurable> *
+ * ------------------------------
+ */
/**
* Read the specified file of rules.
- * @param reader is a reader of regular expressions rules.
+ *
+ * @param reader
+ * is a reader of regular expressions rules.
* @return the corresponding {@RegexRule rules}.
*/
- private List<RegexRule> readRules(Reader reader)
- throws IOException, IllegalArgumentException {
+ private List<RegexRule> readRules(Reader reader) throws IOException,
+ IllegalArgumentException {
BufferedReader in = new BufferedReader(reader);
List<RegexRule> rules = new ArrayList<RegexRule>();
String line;
-
- while((line=in.readLine())!=null) {
+
+ while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
- char first=line.charAt(0);
- boolean sign=false;
+ char first = line.charAt(0);
+ boolean sign = false;
switch (first) {
- case '+' :
- sign=true;
+ case '+':
+ sign = true;
break;
- case '-' :
- sign=false;
+ case '-':
+ sign = false;
break;
- case ' ' : case '\n' : case '#' : // skip blank & comment lines
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
continue;
- default :
- throw new IOException("Invalid first character: "+line);
+ default:
+ throw new IOException("Invalid first character: " + line);
}
String regex = line.substring(1);
- if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Adding rule [" + regex + "]");
+ }
RegexRule rule = createRule(sign, regex);
rules.add(rule);
}
@@ -210,18 +238,20 @@ public abstract class RegexURLFilterBase
/**
* Filter the standard input using a RegexURLFilterBase.
- * @param filter is the RegexURLFilterBase to use for filtering the
- * standard input.
- * @param args some optional parameters (not used).
+ *
+ * @param filter
+ * is the RegexURLFilterBase to use for filtering the standard input.
+ * @param args
+ * some optional parameters (not used).
*/
public static void main(RegexURLFilterBase filter, String args[])
- throws IOException, IllegalArgumentException {
+ throws IOException, IllegalArgumentException {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
- while((line=in.readLine())!=null) {
+ while ((line = in.readLine()) != null) {
String out = filter.filter(line);
- if (out!=null) {
+ if (out != null) {
System.out.print("+");
System.out.println(out);
} else {
Modified: nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Thu Jan 29 05:38:59 2015
@@ -31,26 +31,25 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.nutch.net.URLFilter;
-
/**
* JUnit based test of class <code>RegexURLFilterBase</code>.
- *
+ *
* @author Jérôme Charron
*/
public abstract class RegexURLFilterBaseTest {
/** My logger */
- protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);
+ protected static final Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBaseTest.class);
- private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SEPARATOR = System.getProperty("file.separator");
private final static String SAMPLES = System.getProperty("test.data", ".");
protected abstract URLFilter getURLFilter(Reader rules);
protected void bench(int loops, String file) {
try {
- bench(loops,
- new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
} catch (Exception e) {
Assert.fail(e.toString());
@@ -62,14 +61,14 @@ public abstract class RegexURLFilterBase
try {
URLFilter filter = getURLFilter(rules);
FilteredURL[] expected = readURLFile(urls);
- for (int i=0; i<loops; i++) {
+ for (int i = 0; i < loops; i++) {
test(filter, expected);
}
} catch (Exception e) {
Assert.fail(e.toString());
}
- LOG.info("bench time (" + loops + ") " +
- (System.currentTimeMillis()-start) + "ms");
+ LOG.info("bench time (" + loops + ") "
+ + (System.currentTimeMillis() - start) + "ms");
}
protected void test(String file) {
@@ -90,7 +89,7 @@ public abstract class RegexURLFilterBase
}
protected void test(URLFilter filter, FilteredURL[] expected) {
- for (int i=0; i<expected.length; i++) {
+ for (int i = 0; i < expected.length; i++) {
String result = filter.filter(expected[i].url);
if (result != null) {
Assert.assertTrue(expected[i].url, expected[i].sign);
@@ -104,7 +103,7 @@ public abstract class RegexURLFilterBase
BufferedReader in = new BufferedReader(reader);
List<FilteredURL> list = new ArrayList<FilteredURL>();
String line;
- while((line=in.readLine()) != null) {
+ while ((line = in.readLine()) != null) {
if (line.length() != 0) {
list.add(new FilteredURL(line));
}
@@ -119,13 +118,13 @@ public abstract class RegexURLFilterBase
FilteredURL(String line) {
switch (line.charAt(0)) {
- case '+' :
+ case '+':
sign = true;
break;
- case '-' :
+ case '-':
sign = false;
break;
- default :
+ default:
// Simply ignore...
}
url = line.substring(1);
Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.microformats.reltag;
-
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -29,29 +28,27 @@ import org.apache.nutch.parse.Parse;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-
/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that
- * add <code>tag</code> field(s) to the document.
- *
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that add <code>tag</code>
+ * field(s) to the document.
+ *
* @see <a href="http://www.microformats.org/wiki/rel-tag">
* http://www.microformats.org/wiki/rel-tag</a>
* @author Jérôme Charron
*/
public class RelTagIndexingFilter implements IndexingFilter {
-
private Configuration conf;
-
// Inherited JavaDoc
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
- throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
// Check if some Rel-Tags found, possibly put there by RelTagParser
- String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
+ String[] tags = parse.getData().getParseMeta()
+ .getValues(RelTagParser.REL_TAG);
if (tags != null) {
- for (int i=0; i<tags.length; i++) {
+ for (int i = 0; i < tags.length; i++) {
doc.add("tag", tags[i]);
}
}
@@ -59,10 +56,11 @@ public class RelTagIndexingFilter implem
return doc;
}
- /* ----------------------------- *
- * <implementation:Configurable> *
- * ----------------------------- */
-
+ /*
+ * ----------------------------- * <implementation:Configurable> *
+ * -----------------------------
+ */
+
public void setConf(Configuration conf) {
this.conf = conf;
}
@@ -70,9 +68,10 @@ public class RelTagIndexingFilter implem
public Configuration getConf() {
return this.conf;
}
-
- /* ------------------------------ *
- * </implementation:Configurable> *
- * ------------------------------ */
-
+
+ /*
+ * ------------------------------ * </implementation:Configurable> *
+ * ------------------------------
+ */
+
}
Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Thu Jan 29 05:38:59 2015
@@ -45,24 +45,24 @@ import org.apache.hadoop.conf.Configurat
/**
* Adds microformat rel-tags of document if found.
- *
+ *
* @see <a href="http://www.microformats.org/wiki/rel-tag">
* http://www.microformats.org/wiki/rel-tag</a>
*/
public class RelTagParser implements HtmlParseFilter {
-
+
public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
public final static String REL_TAG = "Rel-Tag";
-
+
private Configuration conf = null;
-
+
/**
* Scan the HTML document looking at possible rel-tags
*/
public ParseResult filter(Content content, ParseResult parseResult,
- HTMLMetaTags metaTags, DocumentFragment doc) {
-
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+
// get parse obj
Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's rel-tags
@@ -79,16 +79,16 @@ public class RelTagParser implements Htm
private static class Parser {
Set<String> tags = null;
-
+
Parser(Node node) {
tags = new TreeSet<String>();
parse(node);
}
-
+
Set<String> getRelTags() {
return tags;
}
-
+
void parse(Node node) {
if (node.getNodeType() == Node.ELEMENT_NODE) {
@@ -105,7 +105,7 @@ public class RelTagParser implements Htm
if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
String tag = parseTag(hrefNode.getNodeValue());
if (!StringUtil.isEmpty(tag)) {
- if(!tags.contains(tag)){
+ if (!tags.contains(tag)) {
tags.add(tag);
LOG.debug("Adding tag: " + tag + " to tag set.");
}
@@ -115,26 +115,27 @@ public class RelTagParser implements Htm
}
}
}
-
+
// Recurse
NodeList children = node.getChildNodes();
- for (int i=0; children != null && i<children.getLength(); i++)
+ for (int i = 0; children != null && i < children.getLength(); i++)
parse(children.item(i));
}
-
+
private final static String parseTag(String url) {
String tag = null;
try {
URL u = new URL(url);
String path = u.getPath();
- tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+ tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+ "UTF-8");
} catch (Exception e) {
// Malformed tag...
tag = null;
}
return tag;
}
-
+
}
public void setConf(Configuration conf) {
Modified: nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Jan 29 05:38:59 2015
@@ -50,18 +50,21 @@ import java.nio.charset.Charset;
public class ExtParser implements Parser {
- public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.ext");
+ public static final Logger LOG = LoggerFactory
+ .getLogger("org.apache.nutch.parse.ext");
static final int BUFFER_SIZE = 4096;
static final int TIMEOUT_DEFAULT = 30; // in seconds
- // handy map from String contentType to String[] {command, timeoutString, encoding}
+ // handy map from String contentType to String[] {command, timeoutString,
+ // encoding}
Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
- private Configuration conf;
+ private Configuration conf;
- public ExtParser () { }
+ public ExtParser() {
+ }
public ParseResult getParse(Content content) {
@@ -70,14 +73,15 @@ public class ExtParser implements Parser
String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
if (params == null)
return new ParseStatus(ParseStatus.FAILED,
- "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
+ "No external command defined for contentType: " + contentType)
+ .getEmptyParseResult(content.getUrl(), getConf());
String command = params[0];
int timeout = Integer.parseInt(params[1]);
String encoding = params[2];
if (LOG.isTraceEnabled()) {
- LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
+ LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
}
String text = null;
@@ -89,19 +93,19 @@ public class ExtParser implements Parser
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
if (contentLength != null
- && raw.length != Integer.parseInt(contentLength)) {
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
- "Content truncated at " + raw.length
- +" bytes. Parser can't handle incomplete "
- + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
+ && raw.length != Integer.parseInt(contentLength)) {
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+ + " bytes. Parser can't handle incomplete " + contentType
+ + " file.").getEmptyParseResult(content.getUrl(), getConf());
}
ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
- ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);
+ ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
CommandRunner cr = new CommandRunner();
- cr.setCommand(command+ " " +contentType);
+ cr.setCommand(command + " " + contentType);
cr.setInputStream(new ByteArrayInputStream(raw));
cr.setStdOutputStream(os);
cr.setStdErrorStream(es);
@@ -111,14 +115,15 @@ public class ExtParser implements Parser
cr.evaluate();
if (cr.getExitValue() != 0)
- return new ParseStatus(ParseStatus.FAILED,
- "External command " + command
- + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
+ return new ParseStatus(ParseStatus.FAILED, "External command "
+ + command + " failed with error: " + es.toString())
+ .getEmptyParseResult(content.getUrl(), getConf());
text = os.toString(encoding);
} catch (Exception e) { // run time exception
- return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
}
if (text == null)
@@ -131,15 +136,15 @@ public class ExtParser implements Parser
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
- outlinks, content.getMetadata());
- return ParseResult.createParseResult(content.getUrl(),
- new ParseImpl(text, parseData));
+ outlinks, content.getMetadata());
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+ parseData));
}
-
+
public void setConf(Configuration conf) {
this.conf = conf;
- Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
- "org.apache.nutch.parse.Parser").getExtensions();
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
String contentType, command, timeoutString, encoding;
@@ -161,13 +166,14 @@ public class ExtParser implements Parser
// null encoding means default
encoding = extension.getAttribute("encoding");
if (encoding == null)
- encoding = Charset.defaultCharset().name();
+ encoding = Charset.defaultCharset().name();
timeoutString = extension.getAttribute("timeout");
if (timeoutString == null || timeoutString.equals(""))
timeoutString = "" + TIMEOUT_DEFAULT;
- TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding });
+ TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
+ encoding });
}
}
Modified: nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Parse wrapper to run external command to do the parsing.
*/
package org.apache.nutch.parse.ext;
+
Modified: nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Thu Jan 29 05:38:59 2015
@@ -37,15 +37,14 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
-/**
- * Unit tests for ExtParser.
- * First creates a temp file with fixed content, then fetch
- * and parse it using external command 'cat' and 'md5sum' alternately
- * for 10 times. Doing so also does a light stress test for class
- * CommandRunner.java (as used in ExtParser.java).
- *
+/**
+ * Unit tests for ExtParser. First creates a temp file with fixed content, then
+ * fetch and parse it using external command 'cat' and 'md5sum' alternately for
+ * 10 times. Doing so also does a light stress test for class CommandRunner.java
+ * (as used in ExtParser.java).
+ *
* Warning: currently only do test on linux platform.
- *
+ *
* @author John Xing
*/
public class TestExtParser {
@@ -67,10 +66,11 @@ public class TestExtParser {
File tempDir = new File(path);
if (!tempDir.exists())
tempDir.mkdir();
- tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir);
+ tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
+ tempDir);
} else {
// otherwise in java.io.tmpdir
- tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
+ tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
}
urlString = tempFile.toURI().toURL().toString();
@@ -79,8 +79,10 @@ public class TestExtParser {
fos.close();
// get nutch content
- Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+ Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
+ .getProtocol(urlString);
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+ .getContent();
protocol = null;
}
@@ -90,8 +92,8 @@ public class TestExtParser {
content = null;
// clean temp file
- //if (tempFile != null && tempFile.exists())
- // tempFile.delete();
+ // if (tempFile != null && tempFile.exists())
+ // tempFile.delete();
}
@Test
@@ -100,24 +102,27 @@ public class TestExtParser {
// now test only on linux platform
if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
- System.err.println("Current OS is "+System.getProperty("os.name")+".");
+ System.err
+ .println("Current OS is " + System.getProperty("os.name") + ".");
System.err.println("No test is run on OS other than linux.");
return;
}
Configuration conf = NutchConfiguration.create();
// loop alternately, total 10*2 times of invoking external command
- for (int i=0; i<10; i++) {
+ for (int i = 0; i < 10; i++) {
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
- Assert.assertEquals(expectedText,parse.getText());
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+ content.getUrl());
+ Assert.assertEquals(expectedText, parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+ content.getUrl());
Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}