You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [15/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Jan 9 06:34:33 2015
@@ -44,7 +44,7 @@ import org.apache.nutch.util.MimeUtil;
import crawlercommons.robots.BaseRobotRules;
public abstract class HttpBase implements Protocol {
-
+
private final static Utf8 RESPONSE_TIME = new Utf8("_rs_");
public static final int BUFFER_SIZE = 8 * 1024;
@@ -69,15 +69,12 @@ public abstract class HttpBase implement
protected int maxContent = 64 * 1024;
/** The Nutch 'User-Agent' request header */
- protected String userAgent = getAgentString(
- "NutchCVS", null, "Nutch",
- "http://nutch.apache.org/bot.html",
- "agent@nutch.apache.org");
-
+ protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+ "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
/** The "Accept-Language" request header value. */
protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
-
+
/** The "Accept" request header value. */
protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
@@ -97,13 +94,13 @@ public abstract class HttpBase implement
/** Response Time */
protected boolean responseTime = true;
-
+
/** Which TLS/SSL protocols to support */
protected Set<String> tlsPreferredProtocols;
-
+
/** Which TLS/SSL cipher suites to support */
protected Set<String> tlsPreferredCipherSuites;
-
+
/** Creates a new instance of HttpBase */
public HttpBase() {
this(null);
@@ -125,37 +122,62 @@ public abstract class HttpBase implement
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 10000);
this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
- this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
- .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
+ this.userAgent = getAgentString(conf.get("http.agent.name"),
+ conf.get("http.agent.version"), conf.get("http.agent.description"),
+ conf.get("http.agent.url"), conf.get("http.agent.email"));
this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
this.accept = conf.get("http.accept", accept);
this.mimeTypes = new MimeUtil(conf);
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.robots.setConf(conf);
-
- String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
- String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
- "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
- "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
- "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
- "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
- "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
- "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
- "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
- "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
- "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
- "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
- "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
-
+
+ String[] protocols = conf.getStrings("http.tls.supported.protocols",
+ "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+ String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_RSA_WITH_AES_256_CBC_SHA256",
+ "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+ "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+ "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+ "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+ "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+ "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+ "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+ "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+ "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+ "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+ "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+ "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+ "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+ "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+ "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+ "TLS_KRB5_WITH_DES_CBC_MD5");
+
tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
@@ -171,81 +193,90 @@ public abstract class HttpBase implement
try {
URL u = new URL(url);
-
+
long startTime = System.currentTimeMillis();
Response response = getResponse(u, page, false); // make a request
- int elapsedTime =(int) (System.currentTimeMillis() - startTime);
-
- if(this.responseTime) {
- page.getMetadata().put(RESPONSE_TIME, ByteBuffer.wrap(Bytes.toBytes(elapsedTime)));
+ int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+
+ if (this.responseTime) {
+ page.getMetadata().put(RESPONSE_TIME,
+ ByteBuffer.wrap(Bytes.toBytes(elapsedTime)));
}
-
+
int code = response.getCode();
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
(content == null ? EMPTY_CONTENT : content),
- response.getHeader("Content-Type"),
- response.getHeaders(), mimeTypes);
+ response.getHeader("Content-Type"), response.getHeaders(), mimeTypes);
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
} else if (code >= 300 && code < 400) { // handle redirect
String location = response.getHeader("Location");
// some broken servers, such as MS IIS, use lowercase header name...
- if (location == null) location = response.getHeader("location");
- if (location == null) location = "";
+ if (location == null)
+ location = response.getHeader("location");
+ if (location == null)
+ location = "";
u = new URL(u, location);
int protocolStatusCode;
switch (code) {
- case 300: // multiple choices, preferred value in Location
+ case 300: // multiple choices, preferred value in Location
protocolStatusCode = ProtocolStatusCodes.MOVED;
break;
- case 301: // moved permanently
- case 305: // use proxy (Location is URL of proxy)
+ case 301: // moved permanently
+ case 305: // use proxy (Location is URL of proxy)
protocolStatusCode = ProtocolStatusCodes.MOVED;
break;
- case 302: // found (temporarily moved)
- case 303: // see other (redirect after POST)
- case 307: // temporary redirect
+ case 302: // found (temporarily moved)
+ case 303: // see other (redirect after POST)
+ case 307: // temporary redirect
protocolStatusCode = ProtocolStatusUtils.TEMP_MOVED;
break;
- case 304: // not modified
+ case 304: // not modified
protocolStatusCode = ProtocolStatusUtils.NOTMODIFIED;
break;
default:
protocolStatusCode = ProtocolStatusUtils.MOVED;
}
// handle this in the higher layer.
- return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(protocolStatusCode, u));
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
- if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
- return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, u));
- } else if (code == 401) { // requires authorization, but no valid auth provided.
- if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
- return new ProtocolOutput(c,
- ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.ACCESS_DENIED,
- "Authentication required: "+ url));
+ if (logger.isTraceEnabled()) {
+ logger.trace("400 Bad request: " + u);
+ }
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.GONE, u));
+ } else if (code == 401) { // requires authorization, but no valid auth
+ // provided.
+ if (logger.isTraceEnabled()) {
+ logger.trace("401 Authentication Required");
+ }
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.ACCESS_DENIED, "Authentication required: "
+ + url));
} else if (code == 404) {
- return new ProtocolOutput(c,
- ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.NOTFOUND, u));
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
- return new ProtocolOutput(c,
- ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u));
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u));
} else {
- return new ProtocolOutput(c,
- ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url="
- + u));
+ return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url=" + u));
}
} catch (Throwable e) {
logger.error("Failed with the following error: ", e);
- return new ProtocolOutput(null,
- ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, e.toString()));
+ return new ProtocolOutput(null, ProtocolStatusUtils.makeStatus(
+ ProtocolStatusCodes.EXCEPTION, e.toString()));
}
}
- /* -------------------------- *
- * </implementation:Protocol> *
- * -------------------------- */
+ /*
+ * -------------------------- * </implementation:Protocol> *
+ * --------------------------
+ */
public String getProxyHost() {
return proxyHost;
}
@@ -269,58 +300,57 @@ public abstract class HttpBase implement
public String getUserAgent() {
return userAgent;
}
-
- /** Value of "Accept-Language" request header sent by Nutch.
+
+ /**
+ * Value of "Accept-Language" request header sent by Nutch.
+ *
* @return The value of the header "Accept-Language" header.
*/
public String getAcceptLanguage() {
- return acceptLanguage;
+ return acceptLanguage;
}
public String getAccept() {
- return accept;
+ return accept;
}
public boolean getUseHttp11() {
return useHttp11;
}
-
+
public Set<String> getTlsPreferredCipherSuites() {
return tlsPreferredCipherSuites;
}
-
+
public Set<String> getTlsPreferredProtocols() {
return tlsPreferredProtocols;
}
- private static String getAgentString(String agentName,
- String agentVersion,
- String agentDesc,
- String agentURL,
- String agentEmail) {
+ private static String getAgentString(String agentName, String agentVersion,
+ String agentDesc, String agentURL, String agentEmail) {
- if ( (agentName == null) || (agentName.trim().length() == 0) ) {
+ if ((agentName == null) || (agentName.trim().length() == 0)) {
// TODO : NUTCH-258
if (LOGGER.isErrorEnabled()) {
LOGGER.error("No User-Agent string set (http.agent.name)!");
}
}
- StringBuffer buf= new StringBuffer();
+ StringBuffer buf = new StringBuffer();
buf.append(agentName);
if (agentVersion != null) {
buf.append("/");
buf.append(agentVersion);
}
- if ( ((agentDesc != null) && (agentDesc.length() != 0))
+ if (((agentDesc != null) && (agentDesc.length() != 0))
|| ((agentEmail != null) && (agentEmail.length() != 0))
- || ((agentURL != null) && (agentURL.length() != 0)) ) {
+ || ((agentURL != null) && (agentURL.length() != 0))) {
buf.append(" (");
if ((agentDesc != null) && (agentDesc.length() != 0)) {
buf.append(agentDesc);
- if ( (agentURL != null) || (agentEmail != null) )
+ if ((agentURL != null) || (agentEmail != null))
buf.append("; ");
}
@@ -350,9 +380,12 @@ public abstract class HttpBase implement
}
}
- public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException {
+ public byte[] processGzipEncoded(byte[] compressed, URL url)
+ throws IOException {
- if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); }
+ if (LOGGER.isTraceEnabled()) {
+ LOGGER.trace("uncompressing....");
+ }
byte[] content;
if (getMaxContent() >= 0) {
@@ -366,25 +399,29 @@ public abstract class HttpBase implement
if (LOGGER.isTraceEnabled()) {
LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to "
- + content.length + " bytes) from " + url);
+ + " bytes of compressed content (expanded to " + content.length
+ + " bytes) from " + url);
}
return content;
}
- public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+ public byte[] processDeflateEncoded(byte[] compressed, URL url)
+ throws IOException {
- if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
+ if (LOGGER.isTraceEnabled()) {
+ LOGGER.trace("inflating....");
+ }
- byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+ byte[] content = DeflateUtils
+ .inflateBestEffort(compressed, getMaxContent());
if (content == null)
throw new IOException("inflateBestEffort returned null");
if (LOGGER.isTraceEnabled()) {
LOGGER.trace("fetched " + compressed.length
- + " bytes of compressed content (expanded to "
- + content.length + " bytes) from " + url);
+ + " bytes of compressed content (expanded to " + content.length
+ + " bytes) from " + url);
}
return content;
}
@@ -409,27 +446,28 @@ public abstract class HttpBase implement
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
- } else // root is required parameter
+ } else
+ // root is required parameter
url = args[i];
}
- ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder().build());
+ ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder()
+ .build());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
if (content != null) {
System.out.println("Content Type: " + content.getContentType());
- System.out.println("Content Length: " +
- content.getMetadata().get(Response.CONTENT_LENGTH));
+ System.out.println("Content Length: "
+ + content.getMetadata().get(Response.CONTENT_LENGTH));
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
}
}
- protected abstract Response getResponse(URL url,
- WebPage page, boolean followRedirects)
- throws ProtocolException, IOException;
+ protected abstract Response getResponse(URL url, WebPage page,
+ boolean followRedirects) throws ProtocolException, IOException;
@Override
public BaseRobotRules getRobotRules(String url, WebPage page) {
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java Fri Jan 9 06:34:33 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a
// Nutch imports
import org.apache.nutch.protocol.ProtocolException;
-
public class HttpException extends ProtocolException {
public HttpException() {
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Fri Jan 9 06:34:33 2015
@@ -30,16 +30,18 @@ import org.slf4j.LoggerFactory;
import java.net.URL;
/**
- * This class is used for parsing robots for urls belonging to HTTP protocol.
- * It extends the generic {@link RobotRulesParser} class and contains
- * Http protocol specific implementation for obtaining the robots file.
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
*/
public class HttpRobotRulesParser extends RobotRulesParser {
-
- public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HttpRobotRulesParser.class);
protected boolean allowForbidden = false;
- HttpRobotRulesParser() { }
+ HttpRobotRulesParser() {
+ }
public HttpRobotRulesParser(Configuration conf) {
super(conf);
@@ -48,14 +50,17 @@ public class HttpRobotRulesParser extend
/** Compose unique key to store and access robot rules in cache for given URL */
protected static String getCacheKey(URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
- String host = url.getHost().toLowerCase(); // normalize to lower case
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+ // case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
int port = url.getPort();
if (port == -1) {
port = url.getDefaultPort();
}
- /* Robot rules apply only to host, protocol, and port where robots.txt is
- * hosted (cf. NUTCH-1752). Consequently */
+ /*
+ * Robot rules apply only to host, protocol, and port where robots.txt is
+ * hosted (cf. NUTCH-1752). Consequently
+ */
String cacheKey = protocol + ":" + host + ":" + port;
return cacheKey;
}
@@ -71,7 +76,7 @@ public class HttpRobotRulesParser extend
* The {@link Protocol} object
* @param url
* URL robots.txt applies to
- *
+ *
* @return {@link BaseRobotRules} holding the rules from robots.txt
*/
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
@@ -80,13 +85,15 @@ public class HttpRobotRulesParser extend
BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
boolean cacheRule = true;
-
- if (robotRules == null) { // cache miss
+
+ if (robotRules == null) { // cache miss
URL redir = null;
- if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
try {
- Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
- WebPage.newBuilder().build(), true);
+ Response response = ((HttpBase) http).getResponse(new URL(url,
+ "/robots.txt"), WebPage.newBuilder().build(), true);
// try one level of redirection ?
if (response.getCode() == 301 || response.getCode() == 302) {
String redirection = response.getHeader("Location");
@@ -101,23 +108,23 @@ public class HttpRobotRulesParser extend
} else {
redir = new URL(redirection);
}
-
- response = ((HttpBase)http).getResponse(redir, WebPage.newBuilder().build(), true);
+
+ response = ((HttpBase) http).getResponse(redir, WebPage
+ .newBuilder().build(), true);
}
}
- if (response.getCode() == 200) // found rules: parse them
- robotRules = parseRules(url.toString(), response.getContent(),
- response.getHeader("Content-Type"),
- agentNames);
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"), agentNames);
- else if ( (response.getCode() == 403) && (!allowForbidden) )
- robotRules = FORBID_ALL_RULES; // use forbid all
+ else if ((response.getCode() == 403) && (!allowForbidden))
+ robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
cacheRule = false;
robotRules = EMPTY_RULES;
- }else
- robotRules = EMPTY_RULES; // use default rules
+ } else
+ robotRules = EMPTY_RULES; // use default rules
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
@@ -127,7 +134,7 @@ public class HttpRobotRulesParser extend
}
if (cacheRule) {
- CACHE.put(cacheKey, robotRules); // cache rules for host
+ CACHE.put(cacheKey, robotRules); // cache rules for host
if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
// cache also for the redirected host
CACHE.put(getCacheKey(redir), robotRules);
Modified: nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Fri Jan 9 06:34:33 2015
@@ -25,10 +25,10 @@ import crawlercommons.robots.BaseRobotRu
import static org.junit.Assert.*;
/**
- * JUnit test case which tests
- * 1. that robots filtering is performed correctly as per the agent name
- * 2. that crawl delay is extracted correctly from the robots file
- *
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ *
*/
public class TestRobotRulesParser {
@@ -37,40 +37,33 @@ public class TestRobotRulesParser {
private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
private static final String UNKNOWN_AGENT = "AgentABC";
private static final String CR = "\r";
-
- private static final String ROBOTS_STRING =
- "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR
- + "Disallow: /b/a" + CR
- + "#Disallow: /c" + CR
- + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
- + "" + CR
- + "" + CR
- + "User-Agent: Agent2" + CR
- + "Disallow: /a/bloh" + CR
- + "Disallow: /c" + CR
- + "Disallow: /foo" + CR
- + "Crawl-delay: 20" + CR
- + "" + CR
- + "User-Agent: *" + CR
- + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
-
- private static final String[] TEST_PATHS = new String[] {
- "http://example.com/a",
- "http://example.com/a/bloh/foo.html",
- "http://example.com/b",
- "http://example.com/c",
- "http://example.com/b/a/index.html",
- "http://example.com/foo/bar/baz.html"
- };
- private static final boolean[] RESULTS = new boolean[] {
- false, // /a
- false, // /a/bloh/foo.html
- true, // /b
- true, // /c
- false, // /b/a/index.html
- true // /foo/bar/baz.html
+ private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+ + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+ + CR
+ + "Crawl-delay: 10"
+ + CR // set crawl delay for Agent1 as 10 sec
+ + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+ + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+ + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+ // crawl
+ // delay
+ // for
+ // other
+ // agents
+
+ private static final String[] TEST_PATHS = new String[] {
+ "http://example.com/a", "http://example.com/a/bloh/foo.html",
+ "http://example.com/b", "http://example.com/c",
+ "http://example.com/b/a/index.html",
+ "http://example.com/foo/bar/baz.html" };
+
+ private static final boolean[] RESULTS = new boolean[] { false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ true, // /c
+ false, // /b/a/index.html
+ true // /foo/bar/baz.html
};
private HttpRobotRulesParser parser;
@@ -82,41 +75,52 @@ public class TestRobotRulesParser {
}
/**
- * Test that the robots rules are interpreted correctly by the robots rules parser.
- */
+ * Test that the robots rules are interpreted correctly by the robots rules
+ * parser.
+ */
@Test
public void testRobotsAgent() {
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT);
- for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
- + "path " + TEST_PATHS[counter]
- + " got " + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue(
+ "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, MULTIPLE_AGENTS);
- for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
- + "path " + TEST_PATHS[counter]
- + " got " + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue(
+ "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
}
/**
- * Test that the crawl delay is extracted from the robots file for respective agent.
- * If its not specified for a given agent, default value must be returned.
- */
+ * Test that the crawl delay is extracted from the robots file for respective
+ * agent. If its not specified for a given agent, default value must be
+ * returned.
+ */
@Test
public void testCrawlDelay() {
- // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
- assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
-
+ // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT);
+ assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+ (rules.getCrawlDelay() == 10000));
+
// for UNKNOWN_AGENT, the default crawl delay must be returned.
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
- assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, UNKNOWN_AGENT);
+ assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+ (rules.getCrawlDelay() == Long.MIN_VALUE));
}
}
Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan 9 06:34:33 2015
@@ -16,11 +16,9 @@
*/
package org.apache.nutch.urlfilter.api;
-
-
/**
* A generic regular expression rule.
- *
+ *
* @author Jérôme Charron
*/
public abstract class RegexRule {
@@ -29,13 +27,15 @@ public abstract class RegexRule {
/**
* Constructs a new regular expression rule.
- *
- * @param sign specifies if this rule must filter-in or filter-out.
- * A <code>true</code> value means that any url matching this rule
- * must be accepted, a <code>false</code> value means that any url
- * matching this rule must be rejected.
- * @param regex is the regular expression used for matching (see
- * {@link #match(String)} method).
+ *
+ * @param sign
+ * specifies if this rule must filter-in or filter-out. A
+ * <code>true</code> value means that any url matching this rule must
+ * be accepted, a <code>false</code> value means that any url
+ * matching this rule must be rejected.
+ * @param regex
+ * is the regular expression used for matching (see
+ * {@link #match(String)} method).
*/
protected RegexRule(boolean sign, String regex) {
this.sign = sign;
@@ -43,19 +43,22 @@ public abstract class RegexRule {
/**
* Return if this rule is used for filtering-in or out.
- *
+ *
* @return <code>true</code> if any url matching this rule must be accepted,
* otherwise <code>false</code>.
*/
- protected boolean accept() { return sign; }
-
+ protected boolean accept() {
+ return sign;
+ }
+
/**
* Checks if a url matches this rule.
- * @param url is the url to check.
- * @return <code>true</code> if the specified url matches this rule,
- * otherwise <code>false</code>.
+ *
+ * @param url
+ * is the url to check.
+ * @return <code>true</code> if the specified url matches this rule, otherwise
+ * <code>false</code>.
*/
protected abstract boolean match(String url);
}
-
Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan 9 06:34:33 2015
@@ -37,27 +37,30 @@ import org.apache.hadoop.conf.Configurat
// Nutch imports
import org.apache.nutch.net.*;
-
/**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
- * regular expressions.
- *
- * <p>The regular expressions rules are expressed in a file. The file of rules
- * is provided by each implementation using the
- * {@link #getRulesFile(Configuration)} method.</p>
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ *
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.
+ * </p>
*
- * <p>The format of this file is made of many rules (one per line):<br/>
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
* <code>
* [+-]<regex>
* </code><br/>
- * where plus (<code>+</code>)means go ahead and index it and minus
- * (<code>-</code>)means no.</p>
-
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.
+ * </p>
*/
public abstract class RegexURLFilterBase implements URLFilter {
/** My logger */
- private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class);
+ private final static Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBase.class);
/** An array of applicable rules */
private List<RegexRule> rules;
@@ -65,24 +68,28 @@ public abstract class RegexURLFilterBase
/** The current configuration */
private Configuration conf;
-
/**
* Constructs a new empty RegexURLFilterBase
*/
- public RegexURLFilterBase() { }
+ public RegexURLFilterBase() {
+ }
/**
* Constructs a new RegexURLFilter and init it with a file of rules.
- * @param filename is the name of rules file.
+ *
+ * @param filename
+ * is the name of rules file.
*/
- public RegexURLFilterBase(File filename)
- throws IOException, IllegalArgumentException {
+ public RegexURLFilterBase(File filename) throws IOException,
+ IllegalArgumentException {
this(new FileReader(filename));
}
-
+
/**
* Constructs a new RegexURLFilter and inits it with a list of rules.
- * @param rules string with a list of rules, one rule per line
+ *
+ * @param rules
+ * string with a list of rules, one rule per line
* @throws IOException
* @throws IllegalArgumentException
*/
@@ -93,68 +100,82 @@ public abstract class RegexURLFilterBase
/**
* Constructs a new RegexURLFilter and init it with a Reader of rules.
- * @param reader is a reader of rules.
+ *
+ * @param reader
+ * is a reader of rules.
*/
- protected RegexURLFilterBase(Reader reader)
- throws IOException, IllegalArgumentException {
+ protected RegexURLFilterBase(Reader reader) throws IOException,
+ IllegalArgumentException {
rules = readRules(reader);
}
-
+
/**
* Creates a new {@link RegexRule}.
- * @param sign of the regular expression.
- * A <code>true</code> value means that any URL matching this rule
- * must be included, whereas a <code>false</code>
- * value means that any URL matching this rule must be excluded.
- * @param regex is the regular expression associated to this rule.
+ *
+ * @param sign
+ * of the regular expression. A <code>true</code> value means that
+ * any URL matching this rule must be included, whereas a
+ * <code>false</code> value means that any URL matching this rule
+ * must be excluded.
+ * @param regex
+ * is the regular expression associated to this rule.
*/
protected abstract RegexRule createRule(boolean sign, String regex);
-
+
/**
- * Returns the name of the file of rules to use for
- * a particular implementation.
- * @param conf is the current configuration.
+ * Returns the name of the file of rules to use for a particular
+ * implementation.
+ *
+ * @param conf
+ * is the current configuration.
* @return the name of the resource containing the rules to use.
*/
- protected abstract Reader getRulesReader(Configuration conf) throws IOException;
-
-
- /* -------------------------- *
- * <implementation:URLFilter> *
- * -------------------------- */
-
+ protected abstract Reader getRulesReader(Configuration conf)
+ throws IOException;
+
+ /*
+ * -------------------------- * <implementation:URLFilter> *
+ * --------------------------
+ */
+
// Inherited Javadoc
public String filter(String url) {
for (RegexRule rule : rules) {
if (rule.match(url)) {
return rule.accept() ? url : null;
}
- };
+ }
+ ;
return null;
}
- /* --------------------------- *
- * </implementation:URLFilter> *
- * --------------------------- */
-
-
- /* ----------------------------- *
- * <implementation:Configurable> *
- * ----------------------------- */
-
+ /*
+ * --------------------------- * </implementation:URLFilter> *
+ * ---------------------------
+ */
+
+ /*
+ * ----------------------------- * <implementation:Configurable> *
+ * -----------------------------
+ */
+
public void setConf(Configuration conf) {
this.conf = conf;
Reader reader = null;
try {
reader = getRulesReader(conf);
} catch (Exception e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
- throw new RuntimeException(e.getMessage(), e);
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+ throw new RuntimeException(e.getMessage(), e);
}
try {
rules = readRules(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
throw new RuntimeException(e.getMessage(), e);
}
}
@@ -162,45 +183,51 @@ public abstract class RegexURLFilterBase
public Configuration getConf() {
return this.conf;
}
-
- /* ------------------------------ *
- * </implementation:Configurable> *
- * ------------------------------ */
-
+
+ /*
+ * ------------------------------ * </implementation:Configurable> *
+ * ------------------------------
+ */
/**
* Read the specified file of rules.
- * @param reader is a reader of regular expressions rules.
+ *
+ * @param reader
+ * is a reader of regular expressions rules.
* @return the corresponding {@RegexRule rules}.
*/
- private List<RegexRule> readRules(Reader reader)
- throws IOException, IllegalArgumentException {
+ private List<RegexRule> readRules(Reader reader) throws IOException,
+ IllegalArgumentException {
BufferedReader in = new BufferedReader(reader);
List<RegexRule> rules = new ArrayList<RegexRule>();
String line;
-
- while((line=in.readLine())!=null) {
+
+ while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
- char first=line.charAt(0);
- boolean sign=false;
+ char first = line.charAt(0);
+ boolean sign = false;
switch (first) {
- case '+' :
- sign=true;
+ case '+':
+ sign = true;
break;
- case '-' :
- sign=false;
+ case '-':
+ sign = false;
break;
- case ' ' : case '\n' : case '#' : // skip blank & comment lines
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
continue;
- default :
- throw new IOException("Invalid first character: "+line);
+ default:
+ throw new IOException("Invalid first character: " + line);
}
String regex = line.substring(1);
- if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Adding rule [" + regex + "]");
+ }
RegexRule rule = createRule(sign, regex);
rules.add(rule);
}
@@ -209,18 +236,20 @@ public abstract class RegexURLFilterBase
/**
* Filter the standard input using a RegexURLFilterBase.
- * @param filter is the RegexURLFilterBase to use for filtering the
- * standard input.
- * @param args some optional parameters (not used).
+ *
+ * @param filter
+ * is the RegexURLFilterBase to use for filtering the standard input.
+ * @param args
+ * some optional parameters (not used).
*/
public static void main(RegexURLFilterBase filter, String args[])
- throws IOException, IllegalArgumentException {
+ throws IOException, IllegalArgumentException {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
- while((line=in.readLine())!=null) {
+ while ((line = in.readLine()) != null) {
String out = filter.filter(line);
- if (out!=null) {
+ if (out != null) {
System.out.print("+");
System.out.println(out);
} else {
Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Fri Jan 9 06:34:33 2015
@@ -42,52 +42,52 @@ import org.apache.nutch.net.URLFilter;
* JUnit based test of class <code>RegexURLFilterBase</code>.
*/
-//@RunWith(Suite.class)
-//@Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class})
+// @RunWith(Suite.class)
+// @Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class})
public abstract class RegexURLFilterBaseTest {
-
+
/** My logger */
- protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);
+ protected static final Logger LOG = LoggerFactory
+ .getLogger(RegexURLFilterBaseTest.class);
- private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SEPARATOR = System.getProperty("file.separator");
private final static String SAMPLES = System.getProperty("test.data", ".");
-
+
protected abstract URLFilter getURLFilter(Reader rules);
protected void bench(int loops, String file) {
try {
- bench(loops,
- new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
} catch (Exception e) {
fail(e.toString());
}
}
-
+
protected void bench(int loops, Reader rules, Reader urls) {
long start = System.currentTimeMillis();
try {
URLFilter filter = getURLFilter(rules);
FilteredURL[] expected = readURLFile(urls);
- for (int i=0; i<loops; i++) {
+ for (int i = 0; i < loops; i++) {
test(filter, expected);
}
} catch (Exception e) {
fail(e.toString());
}
- LOG.info("bench time (" + loops + ") " +
- (System.currentTimeMillis()-start) + "ms");
+ LOG.info("bench time (" + loops + ") "
+ + (System.currentTimeMillis() - start) + "ms");
}
-
+
protected void test(String file) {
try {
test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
} catch (Exception e) {
fail(e.toString());
}
}
-
+
protected void test(Reader rules, Reader urls) {
try {
test(getURLFilter(rules), readURLFile(urls));
@@ -95,9 +95,9 @@ public abstract class RegexURLFilterBase
fail(e.toString());
}
}
-
+
protected void test(URLFilter filter, FilteredURL[] expected) {
- for (int i=0; i<expected.length; i++) {
+ for (int i = 0; i < expected.length; i++) {
String result = filter.filter(expected[i].url);
if (result != null) {
assertTrue(expected[i].url, expected[i].sign);
@@ -106,37 +106,37 @@ public abstract class RegexURLFilterBase
}
}
}
-
+
private static FilteredURL[] readURLFile(Reader reader) throws IOException {
BufferedReader in = new BufferedReader(reader);
List<FilteredURL> list = new ArrayList<FilteredURL>();
String line;
- while((line=in.readLine()) != null) {
+ while ((line = in.readLine()) != null) {
if (line.length() != 0) {
list.add(new FilteredURL(line));
}
}
return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
}
-
+
private static class FilteredURL {
-
+
boolean sign;
String url;
FilteredURL(String line) {
switch (line.charAt(0)) {
- case '+' :
+ case '+':
sign = true;
break;
- case '-' :
+ case '-':
sign = false;
break;
- default :
+ default:
// Simply ignore...
}
url = line.substring(1);
}
}
-
+
}
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -39,7 +39,7 @@ import org.apache.nutch.util.Bytes;
* @author Jérôme Charron
*/
public class RelTagIndexingFilter implements IndexingFilter {
-
+
private Configuration conf;
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -50,10 +50,9 @@ public class RelTagIndexingFilter implem
}
/**
- * Gets all the fields for a given {@link WebPage}
- * Many datastores need to setup the mapreduce job by specifying the fields
- * needed. All extensions that work on WebPage are able to specify what fields
- * they need.
+ * Gets all the fields for a given {@link WebPage}. Many datastores need to
+ * setup the mapreduce job by specifying the fields needed. All extensions
+ * that work on WebPage are able to specify what fields they need.
*/
@Override
public Collection<Field> getFields() {
@@ -73,24 +72,28 @@ public class RelTagIndexingFilter implem
public Configuration getConf() {
return this.conf;
}
-
+
/**
* The {@link RelTagIndexingFilter} filter object.
- *
- * @param doc The {@link NutchDocument} object
- * @param url URL to be filtered for rel-tag's
- * @param page {@link WebPage} object relative to the URL
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param url
+ * URL to be filtered for rel-tag's
+ * @param page
+ * {@link WebPage} object relative to the URL
* @return filtered NutchDocument
*/
@Override
- public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException {
- // Check if some Rel-Tags found, possibly put there by RelTagParser
+ public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+ throws IndexingException {
+ // Check if some Rel-Tags found, possibly put there by RelTagParser
ByteBuffer bb = page.getMetadata().get(new Utf8(RelTagParser.REL_TAG));
-
+
if (bb != null) {
String[] tags = Bytes.toString(bb).split("\t");
for (int i = 0; i < tags.length; i++) {
- doc.add("tag", tags[i]);
+ doc.add("tag", tags[i]);
}
}
return doc;
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Jan 9 06:34:33 2015
@@ -74,26 +74,26 @@ public class RelTagParser implements Par
if (node.getNodeType() == Node.ELEMENT_NODE) {
// Look for <a> tag
if ("a".equalsIgnoreCase(node.getNodeName())) {
- NamedNodeMap attrs = node.getAttributes();
- Node hrefNode = attrs.getNamedItem("href");
- // Checks that it contains a href attribute
- if (hrefNode != null) {
- Node relNode = attrs.getNamedItem("rel");
- // Checks that it contains a rel attribute too
- if (relNode != null) {
- // Finaly checks that rel=tag
- if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
- String tag = parseTag(hrefNode.getNodeValue());
- if (!StringUtil.isEmpty(tag)) {
- if(!tags.contains(tag)){
+ NamedNodeMap attrs = node.getAttributes();
+ Node hrefNode = attrs.getNamedItem("href");
+ // Checks that it contains a href attribute
+ if (hrefNode != null) {
+ Node relNode = attrs.getNamedItem("rel");
+ // Checks that it contains a rel attribute too
+ if (relNode != null) {
+ // Finally checks that rel=tag
+ if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+ String tag = parseTag(hrefNode.getNodeValue());
+ if (!StringUtil.isEmpty(tag)) {
+ if (!tags.contains(tag)) {
tags.add(tag);
- LOG.debug("Adding tag: " + tag + " to tag set.");
+ LOG.debug("Adding tag: " + tag + " to tag set.");
}
- }
- }
- }
- }
- }
+ }
+ }
+ }
+ }
+ }
}
// Recurse
@@ -108,11 +108,13 @@ public class RelTagParser implements Par
try {
URL u = new URL(url);
String path = u.getPath();
- tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+ tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+ "UTF-8");
} catch (Exception e) {
// Malformed tag...
tag = null;
- } return tag;
+ }
+ return tag;
}
}
@@ -136,12 +138,11 @@ public class RelTagParser implements Par
FIELDS.add(WebPage.Field.BASE_URL);
FIELDS.add(WebPage.Field.METADATA);
}
-
+
/**
- * Gets all the fields for a given {@link WebPage}
- * Many datastores need to setup the mapreduce job by specifying the fields
- * needed. All extensions that work on WebPage are able to specify what fields
- * they need.
+ * Gets all the fields for a given {@link WebPage}. Many datastores need to
+ * setup the mapreduce job by specifying the fields needed. All extensions
+ * that work on WebPage are able to specify what fields they need.
*/
@Override
public Collection<Field> getFields() {
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -28,13 +28,13 @@ import java.nio.ByteBuffer;
import static org.junit.Assert.*;
/**
- *JUnit test case for {@link RelTagIndexingFilter} which
- *simply asserts that a 'tag' field is obtained by the filter.
- *
- *@author lewismc
+ * JUnit test case for {@link RelTagIndexingFilter} which simply asserts that a
+ * 'tag' field is obtained by the filter.
+ *
+ * @author lewismc
*/
- public class TestRelTagIndexingFilter {
+public class TestRelTagIndexingFilter {
@Test
public void testRelTagFields() throws Exception {
@@ -57,4 +57,3 @@ import static org.junit.Assert.*;
assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag"));
}
}
-
\ No newline at end of file
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Fri Jan 9 06:34:33 2015
@@ -38,15 +38,15 @@ import java.nio.ByteBuffer;
import static org.junit.Assert.assertEquals;
/**
- * Junit test for {@link RelTagParser} based mainly John Xing's parser tests.
- * We are not concerned with actual parse text within the sample file, instead
- * we assert that the rel-tags we expect are found in the WebPage metadata.
- * To check the parser is working as expected we unwrap the ByteBuffer obtained
- * from metadata, the same type as we use in expected (String). So just the
+ * Junit test for {@link RelTagParser} based mainly on John Xing's parser tests. We
+ * are not concerned with actual parse text within the sample file, instead we
+ * assert that the rel-tags we expect are found in the WebPage metadata. To
+ * check the parser is working as expected we unwrap the ByteBuffer obtained
+ * from metadata, the same type as we use in expected (String). So just the
* other way around as we wrapped the metadata value.
*
* @author lewismc
- *
+ *
*/
public class TestRelTagParser {
@@ -58,14 +58,15 @@ public class TestRelTagParser {
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/microformats-reltag/build.xml during plugin compilation.
private String sampleFile = "microformats_reltag_test.html";
-
+
// rel-tag's we expect to be extracted from page.getMetadata()
private String expectedRelTags = "Category:Specifications Category:rel-tag ";
-
+
private Configuration conf;
-
+
@Test
- public void testRelTagParser() throws ParseException, ProtocolException, IOException {
+ public void testRelTagParser() throws ParseException, ProtocolException,
+ IOException {
conf = NutchConfiguration.create();
conf.set("file.content.limit", "-1");
@SuppressWarnings("unused")
@@ -85,14 +86,14 @@ public class TestRelTagParser {
String mtype = mimeutil.getMimeType(file);
page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
- //begin assertion for tests
+ // begin assertion for tests
ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag"));
byte[] byteArray = new byte[bbuf.remaining()];
bbuf.get(byteArray);
String s = new String(byteArray);
- //bbuf.flip();
- assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
- expectedRelTags, s);
+ // bbuf.flip();
+ assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
+ expectedRelTags, s);
}
-
+
}
\ No newline at end of file