You are viewing a plain text version of this content; the canonical link was provided in the original (HTML) message.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC

svn commit: r1650447 [15/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ s...

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Jan  9 06:34:33 2015
@@ -44,7 +44,7 @@ import org.apache.nutch.util.MimeUtil;
 import crawlercommons.robots.BaseRobotRules;
 
 public abstract class HttpBase implements Protocol {
-  
+
   private final static Utf8 RESPONSE_TIME = new Utf8("_rs_");
 
   public static final int BUFFER_SIZE = 8 * 1024;
@@ -69,15 +69,12 @@ public abstract class HttpBase implement
   protected int maxContent = 64 * 1024;
 
   /** The Nutch 'User-Agent' request header */
-  protected String userAgent = getAgentString(
-      "NutchCVS", null, "Nutch",
-      "http://nutch.apache.org/bot.html",
-      "agent@nutch.apache.org");
-
+  protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+      "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
 
   /** The "Accept-Language" request header value. */
   protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
-  
+
   /** The "Accept" request header value. */
   protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
 
@@ -97,13 +94,13 @@ public abstract class HttpBase implement
 
   /** Response Time */
   protected boolean responseTime = true;
-  
+
   /** Which TLS/SSL protocols to support */
   protected Set<String> tlsPreferredProtocols;
-  
+
   /** Which TLS/SSL cipher suites to support */
   protected Set<String> tlsPreferredCipherSuites;
-  
+
   /** Creates a new instance of HttpBase */
   public HttpBase() {
     this(null);
@@ -125,37 +122,62 @@ public abstract class HttpBase implement
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
     this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
-    this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
-        .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
+    this.userAgent = getAgentString(conf.get("http.agent.name"),
+        conf.get("http.agent.version"), conf.get("http.agent.description"),
+        conf.get("http.agent.url"), conf.get("http.agent.email"));
     this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
     this.accept = conf.get("http.accept", accept);
     this.mimeTypes = new MimeUtil(conf);
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
     this.robots.setConf(conf);
-    
-    String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
-    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", 
-        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
-        "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
-        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
-        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
-        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
-        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
-        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
-        "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
-        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
-        "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
-        "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
-        "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
-        "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
-        "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
-        "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
-        "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
-        "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
-        "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
-    
+
+    String[] protocols = conf.getStrings("http.tls.supported.protocols",
+        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+        "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+        "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+        "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+        "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+        "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+        "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+        "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+        "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+        "TLS_KRB5_WITH_DES_CBC_MD5");
+
     tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
     tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
 
@@ -171,81 +193,90 @@ public abstract class HttpBase implement
 
     try {
       URL u = new URL(url);
-      
+
       long startTime = System.currentTimeMillis();
       Response response = getResponse(u, page, false); // make a request
-      int elapsedTime =(int) (System.currentTimeMillis() - startTime);
-      
-      if(this.responseTime) {
-        page.getMetadata().put(RESPONSE_TIME, ByteBuffer.wrap(Bytes.toBytes(elapsedTime)));
+      int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+
+      if (this.responseTime) {
+        page.getMetadata().put(RESPONSE_TIME,
+            ByteBuffer.wrap(Bytes.toBytes(elapsedTime)));
       }
-      
+
       int code = response.getCode();
       byte[] content = response.getContent();
       Content c = new Content(u.toString(), u.toString(),
           (content == null ? EMPTY_CONTENT : content),
-          response.getHeader("Content-Type"),
-          response.getHeaders(), mimeTypes);
+          response.getHeader("Content-Type"), response.getHeaders(), mimeTypes);
 
       if (code == 200) { // got a good response
         return new ProtocolOutput(c); // return it
       } else if (code >= 300 && code < 400) { // handle redirect
         String location = response.getHeader("Location");
         // some broken servers, such as MS IIS, use lowercase header name...
-        if (location == null) location = response.getHeader("location");
-        if (location == null) location = "";
+        if (location == null)
+          location = response.getHeader("location");
+        if (location == null)
+          location = "";
         u = new URL(u, location);
         int protocolStatusCode;
         switch (code) {
-        case 300:   // multiple choices, preferred value in Location
+        case 300: // multiple choices, preferred value in Location
           protocolStatusCode = ProtocolStatusCodes.MOVED;
           break;
-        case 301:   // moved permanently
-        case 305:   // use proxy (Location is URL of proxy)
+        case 301: // moved permanently
+        case 305: // use proxy (Location is URL of proxy)
           protocolStatusCode = ProtocolStatusCodes.MOVED;
           break;
-        case 302:   // found (temporarily moved)
-        case 303:   // see other (redirect after POST)
-        case 307:   // temporary redirect
+        case 302: // found (temporarily moved)
+        case 303: // see other (redirect after POST)
+        case 307: // temporary redirect
           protocolStatusCode = ProtocolStatusUtils.TEMP_MOVED;
           break;
-        case 304:   // not modified
+        case 304: // not modified
           protocolStatusCode = ProtocolStatusUtils.NOTMODIFIED;
           break;
         default:
           protocolStatusCode = ProtocolStatusUtils.MOVED;
         }
         // handle this in the higher layer.
-        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(protocolStatusCode, u));
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            protocolStatusCode, u));
       } else if (code == 400) { // bad request, mark as GONE
-        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
-        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, u));
-      } else if (code == 401) { // requires authorization, but no valid auth provided.
-        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
-        return new ProtocolOutput(c,
-            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.ACCESS_DENIED,
-                "Authentication required: "+ url));
+        if (logger.isTraceEnabled()) {
+          logger.trace("400 Bad request: " + u);
+        }
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            ProtocolStatusCodes.GONE, u));
+      } else if (code == 401) { // requires authorization, but no valid auth
+                                // provided.
+        if (logger.isTraceEnabled()) {
+          logger.trace("401 Authentication Required");
+        }
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            ProtocolStatusCodes.ACCESS_DENIED, "Authentication required: "
+                + url));
       } else if (code == 404) {
-        return new ProtocolOutput(c,
-            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.NOTFOUND, u));
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            ProtocolStatusCodes.NOTFOUND, u));
       } else if (code == 410) { // permanently GONE
-        return new ProtocolOutput(c,
-            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u));
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u));
       } else {
-        return new ProtocolOutput(c,
-            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url="
-                + u));
+        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(
+            ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url=" + u));
       }
     } catch (Throwable e) {
       logger.error("Failed with the following error: ", e);
-      return new ProtocolOutput(null,
-          ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, e.toString()));
+      return new ProtocolOutput(null, ProtocolStatusUtils.makeStatus(
+          ProtocolStatusCodes.EXCEPTION, e.toString()));
     }
   }
 
-  /* -------------------------- *
-   * </implementation:Protocol> *
-   * -------------------------- */
+  /*
+   * -------------------------- * </implementation:Protocol> *
+   * --------------------------
+   */
   public String getProxyHost() {
     return proxyHost;
   }
@@ -269,58 +300,57 @@ public abstract class HttpBase implement
   public String getUserAgent() {
     return userAgent;
   }
-  
-  /** Value of "Accept-Language" request header sent by Nutch.
+
+  /**
+   * Value of "Accept-Language" request header sent by Nutch.
+   * 
    * @return The value of the header "Accept-Language" header.
    */
   public String getAcceptLanguage() {
-         return acceptLanguage;
+    return acceptLanguage;
   }
 
   public String getAccept() {
-         return accept;
+    return accept;
   }
 
   public boolean getUseHttp11() {
     return useHttp11;
   }
-  
+
   public Set<String> getTlsPreferredCipherSuites() {
     return tlsPreferredCipherSuites;
   }
-  
+
   public Set<String> getTlsPreferredProtocols() {
     return tlsPreferredProtocols;
   }
 
-  private static String getAgentString(String agentName,
-      String agentVersion,
-      String agentDesc,
-      String agentURL,
-      String agentEmail) {
+  private static String getAgentString(String agentName, String agentVersion,
+      String agentDesc, String agentURL, String agentEmail) {
 
-    if ( (agentName == null) || (agentName.trim().length() == 0) ) {
+    if ((agentName == null) || (agentName.trim().length() == 0)) {
       // TODO : NUTCH-258
       if (LOGGER.isErrorEnabled()) {
         LOGGER.error("No User-Agent string set (http.agent.name)!");
       }
     }
 
-    StringBuffer buf= new StringBuffer();
+    StringBuffer buf = new StringBuffer();
 
     buf.append(agentName);
     if (agentVersion != null) {
       buf.append("/");
       buf.append(agentVersion);
     }
-    if ( ((agentDesc != null) && (agentDesc.length() != 0))
+    if (((agentDesc != null) && (agentDesc.length() != 0))
         || ((agentEmail != null) && (agentEmail.length() != 0))
-        || ((agentURL != null) && (agentURL.length() != 0)) ) {
+        || ((agentURL != null) && (agentURL.length() != 0))) {
       buf.append(" (");
 
       if ((agentDesc != null) && (agentDesc.length() != 0)) {
         buf.append(agentDesc);
-        if ( (agentURL != null) || (agentEmail != null) )
+        if ((agentURL != null) || (agentEmail != null))
           buf.append("; ");
       }
 
@@ -350,9 +380,12 @@ public abstract class HttpBase implement
     }
   }
 
-  public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException {
+  public byte[] processGzipEncoded(byte[] compressed, URL url)
+      throws IOException {
 
-    if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); }
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("uncompressing....");
+    }
 
     byte[] content;
     if (getMaxContent() >= 0) {
@@ -366,25 +399,29 @@ public abstract class HttpBase implement
 
     if (LOGGER.isTraceEnabled()) {
       LOGGER.trace("fetched " + compressed.length
-          + " bytes of compressed content (expanded to "
-          + content.length + " bytes) from " + url);
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
     }
     return content;
   }
 
-  public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+  public byte[] processDeflateEncoded(byte[] compressed, URL url)
+      throws IOException {
 
-    if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("inflating....");
+    }
 
-    byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+    byte[] content = DeflateUtils
+        .inflateBestEffort(compressed, getMaxContent());
 
     if (content == null)
       throw new IOException("inflateBestEffort returned null");
 
     if (LOGGER.isTraceEnabled()) {
       LOGGER.trace("fetched " + compressed.length
-                 + " bytes of compressed content (expanded to "
-                 + content.length + " bytes) from " + url);
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
     }
     return content;
   }
@@ -409,27 +446,28 @@ public abstract class HttpBase implement
       } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
-      } else // root is required parameter
+      } else
+        // root is required parameter
         url = args[i];
     }
 
-    ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder().build());
+    ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder()
+        .build());
     Content content = out.getContent();
 
     System.out.println("Status: " + out.getStatus());
     if (content != null) {
       System.out.println("Content Type: " + content.getContentType());
-      System.out.println("Content Length: " +
-          content.getMetadata().get(Response.CONTENT_LENGTH));
+      System.out.println("Content Length: "
+          + content.getMetadata().get(Response.CONTENT_LENGTH));
       System.out.println("Content:");
       String text = new String(content.getContent());
       System.out.println(text);
     }
   }
 
-  protected abstract Response getResponse(URL url,
-      WebPage page, boolean followRedirects)
-  throws ProtocolException, IOException;
+  protected abstract Response getResponse(URL url, WebPage page,
+      boolean followRedirects) throws ProtocolException, IOException;
 
   @Override
   public BaseRobotRules getRobotRules(String url, WebPage page) {

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java Fri Jan  9 06:34:33 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a
 // Nutch imports
 import org.apache.nutch.protocol.ProtocolException;
 
-
 public class HttpException extends ProtocolException {
 
   public HttpException() {

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Fri Jan  9 06:34:33 2015
@@ -30,16 +30,18 @@ import org.slf4j.LoggerFactory;
 import java.net.URL;
 
 /**
- * This class is used for parsing robots for urls belonging to HTTP protocol.
- * It extends the generic {@link RobotRulesParser} class and contains 
- * Http protocol specific implementation for obtaining the robots file.
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
  */
 public class HttpRobotRulesParser extends RobotRulesParser {
-  
-  public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpRobotRulesParser.class);
   protected boolean allowForbidden = false;
 
-  HttpRobotRulesParser() { }
+  HttpRobotRulesParser() {
+  }
 
   public HttpRobotRulesParser(Configuration conf) {
     super(conf);
@@ -48,14 +50,17 @@ public class HttpRobotRulesParser extend
 
   /** Compose unique key to store and access robot rules in cache for given URL */
   protected static String getCacheKey(URL url) {
-    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
-    String host = url.getHost().toLowerCase();          // normalize to lower case
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+                                                       // case
+    String host = url.getHost().toLowerCase(); // normalize to lower case
     int port = url.getPort();
     if (port == -1) {
       port = url.getDefaultPort();
     }
-   /* Robot rules apply only to host, protocol, and port where robots.txt is
-    * hosted (cf. NUTCH-1752). Consequently  */
+    /*
+     * Robot rules apply only to host, protocol, and port where robots.txt is
+     * hosted (cf. NUTCH-1752). Consequently
+     */
     String cacheKey = protocol + ":" + host + ":" + port;
     return cacheKey;
   }
@@ -71,7 +76,7 @@ public class HttpRobotRulesParser extend
    *          The {@link Protocol} object
    * @param url
    *          URL robots.txt applies to
-   *
+   * 
    * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
@@ -80,13 +85,15 @@ public class HttpRobotRulesParser extend
     BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
-    
-    if (robotRules == null) {                     // cache miss
+
+    if (robotRules == null) { // cache miss
       URL redir = null;
-      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("cache miss " + url);
+      }
       try {
-        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
-                                             WebPage.newBuilder().build(), true);
+        Response response = ((HttpBase) http).getResponse(new URL(url,
+            "/robots.txt"), WebPage.newBuilder().build(), true);
         // try one level of redirection ?
         if (response.getCode() == 301 || response.getCode() == 302) {
           String redirection = response.getHeader("Location");
@@ -101,23 +108,23 @@ public class HttpRobotRulesParser extend
             } else {
               redir = new URL(redirection);
             }
-            
-            response = ((HttpBase)http).getResponse(redir, WebPage.newBuilder().build(), true);
+
+            response = ((HttpBase) http).getResponse(redir, WebPage
+                .newBuilder().build(), true);
           }
         }
 
-        if (response.getCode() == 200)               // found rules: parse them
-          robotRules =  parseRules(url.toString(), response.getContent(), 
-                                   response.getHeader("Content-Type"), 
-                                   agentNames);
+        if (response.getCode() == 200) // found rules: parse them
+          robotRules = parseRules(url.toString(), response.getContent(),
+              response.getHeader("Content-Type"), agentNames);
 
-        else if ( (response.getCode() == 403) && (!allowForbidden) )
-          robotRules = FORBID_ALL_RULES;            // use forbid all
+        else if ((response.getCode() == 403) && (!allowForbidden))
+          robotRules = FORBID_ALL_RULES; // use forbid all
         else if (response.getCode() >= 500) {
           cacheRule = false;
           robotRules = EMPTY_RULES;
-        }else                                        
-          robotRules = EMPTY_RULES;                 // use default rules
+        } else
+          robotRules = EMPTY_RULES; // use default rules
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
@@ -127,7 +134,7 @@ public class HttpRobotRulesParser extend
       }
 
       if (cacheRule) {
-        CACHE.put(cacheKey, robotRules);  // cache rules for host
+        CACHE.put(cacheKey, robotRules); // cache rules for host
         if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
           CACHE.put(getCacheKey(redir), robotRules);

Modified: nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Fri Jan  9 06:34:33 2015
@@ -25,10 +25,10 @@ import crawlercommons.robots.BaseRobotRu
 import static org.junit.Assert.*;
 
 /**
- * JUnit test case which tests
- * 1. that robots filtering is performed correctly as per the agent name
- * 2. that crawl delay is extracted correctly from the robots file
- *
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ * 
  */
 public class TestRobotRulesParser {
 
@@ -37,40 +37,33 @@ public class TestRobotRulesParser {
   private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
   private static final String UNKNOWN_AGENT = "AgentABC";
   private static final String CR = "\r";
-  
-  private static final String ROBOTS_STRING = 
-      "User-Agent: Agent1 #foo" + CR 
-      + "Disallow: /a" + CR 
-      + "Disallow: /b/a" + CR 
-      + "#Disallow: /c" + CR 
-      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec
-      + "" + CR 
-      + "" + CR 
-      + "User-Agent: Agent2" + CR 
-      + "Disallow: /a/bloh" + CR 
-      + "Disallow: /c" + CR
-      + "Disallow: /foo" + CR
-      + "Crawl-delay: 20" + CR
-      + "" + CR 
-      + "User-Agent: *" + CR 
-      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents
-  
-  private static final String[] TEST_PATHS = new String[] {
-    "http://example.com/a",
-    "http://example.com/a/bloh/foo.html",
-    "http://example.com/b",
-    "http://example.com/c",
-    "http://example.com/b/a/index.html",
-    "http://example.com/foo/bar/baz.html"
-  };
 
-  private static final boolean[] RESULTS = new boolean[] {
-    false,  //  /a
-    false,  //  /a/bloh/foo.html
-    true,   //  /b
-    true,   //  /c
-    false,  //  /b/a/index.html
-    true    //  /foo/bar/baz.html
+  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+      + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+      + CR
+      + "Crawl-delay: 10"
+      + CR // set crawl delay for Agent1 as 10 sec
+      + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+      + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+      + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+                                                                          // crawl
+                                                                          // delay
+                                                                          // for
+                                                                          // other
+                                                                          // agents
+
+  private static final String[] TEST_PATHS = new String[] {
+      "http://example.com/a", "http://example.com/a/bloh/foo.html",
+      "http://example.com/b", "http://example.com/c",
+      "http://example.com/b/a/index.html",
+      "http://example.com/foo/bar/baz.html" };
+
+  private static final boolean[] RESULTS = new boolean[] { false, // /a
+      false, // /a/bloh/foo.html
+      true, // /b
+      true, // /c
+      false, // /b/a/index.html
+      true // /foo/bar/baz.html
   };
 
   private HttpRobotRulesParser parser;
@@ -82,41 +75,52 @@ public class TestRobotRulesParser {
   }
 
   /**
-  * Test that the robots rules are interpreted correctly by the robots rules parser. 
-  */
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
+   */
   @Test
   public void testRobotsAgent() {
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
 
-    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
-      assertTrue("testing on agent (" + SINGLE_AGENT + "), and " 
-              + "path " + TEST_PATHS[counter] 
-              + " got " + rules.isAllowed(TEST_PATHS[counter]),
-              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue(
+          "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
 
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, MULTIPLE_AGENTS);
 
-    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
-      assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " 
-              + "path " + TEST_PATHS[counter] 
-              + " got " + rules.isAllowed(TEST_PATHS[counter]),
-              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue(
+          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
   }
 
   /**
-  * Test that the crawl delay is extracted from the robots file for respective agent. 
-  * If its not specified for a given agent, default value must be returned.
-  */
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
+   */
   @Test
   public void testCrawlDelay() {
-    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
-    assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
-    
+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+    assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+        (rules.getCrawlDelay() == 10000));
+
     // for UNKNOWN_AGENT, the default crawl delay must be returned.
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
-    assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, UNKNOWN_AGENT);
+    assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
   }
 }

Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan  9 06:34:33 2015
@@ -16,11 +16,9 @@
  */
 package org.apache.nutch.urlfilter.api;
 
-
-
 /**
  * A generic regular expression rule.
- *
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public abstract class RegexRule {
@@ -29,13 +27,15 @@ public abstract class RegexRule {
 
   /**
    * Constructs a new regular expression rule.
-   *
-   * @param sign specifies if this rule must filter-in or filter-out.
-   *        A <code>true</code> value means that any url matching this rule
-   *        must be accepted, a <code>false</code> value means that any url
-   *        matching this rule must be rejected.
-   * @param regex is the regular expression used for matching (see
-   *        {@link #match(String)} method).
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
    */
   protected RegexRule(boolean sign, String regex) {
     this.sign = sign;
@@ -43,19 +43,22 @@ public abstract class RegexRule {
 
   /**
    * Return if this rule is used for filtering-in or out.
-   *
+   * 
    * @return <code>true</code> if any url matching this rule must be accepted,
    *         otherwise <code>false</code>.
    */
-  protected boolean accept() { return sign; }
-  
+  protected boolean accept() {
+    return sign;
+  }
+
   /**
    * Checks if a url matches this rule.
-   * @param url is the url to check.
-   * @return <code>true</code> if the specified url matches this rule,
-   *         otherwise <code>false</code>.
+   * 
+   * @param url
+   *          is the url to check.
+   * @return <code>true</code> if the specified url matches this rule, otherwise
+   *         <code>false</code>.
    */
   protected abstract boolean match(String url);
 
 }
-

Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan  9 06:34:33 2015
@@ -37,27 +37,30 @@ import org.apache.hadoop.conf.Configurat
 // Nutch imports
 import org.apache.nutch.net.*;
 
-
 /**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
- * regular expressions.
- *
- * <p>The regular expressions rules are expressed in a file. The file of rules
- * is provided by each implementation using the
- * {@link #getRulesFile(Configuration)} method.</p>
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ * 
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * provided by each implementation using the
+ * {@link #getRulesFile(Configuration)} method.
+ * </p>
  * 
- * <p>The format of this file is made of many rules (one per line):<br/>
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
  * <code>
  * [+-]&lt;regex&gt;
  * </code><br/>
- * where plus (<code>+</code>)means go ahead and index it and minus 
- * (<code>-</code>)means no.</p>
-
+ * where plus (<code>+</code>)means go ahead and index it and minus (
+ * <code>-</code>)means no.
+ * </p>
  */
 public abstract class RegexURLFilterBase implements URLFilter {
 
   /** My logger */
-  private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class);
+  private final static Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBase.class);
 
   /** An array of applicable rules */
   private List<RegexRule> rules;
@@ -65,24 +68,28 @@ public abstract class RegexURLFilterBase
   /** The current configuration */
   private Configuration conf;
 
-
   /**
    * Constructs a new empty RegexURLFilterBase
    */
-  public RegexURLFilterBase() { }
+  public RegexURLFilterBase() {
+  }
 
   /**
    * Constructs a new RegexURLFilter and init it with a file of rules.
-   * @param filename is the name of rules file.
+   * 
+   * @param filename
+   *          is the name of rules file.
    */
-  public RegexURLFilterBase(File filename)
-    throws IOException, IllegalArgumentException {
+  public RegexURLFilterBase(File filename) throws IOException,
+      IllegalArgumentException {
     this(new FileReader(filename));
   }
-  
+
   /**
    * Constructs a new RegexURLFilter and inits it with a list of rules.
-   * @param rules string with a list of rules, one rule per line
+   * 
+   * @param rules
+   *          string with a list of rules, one rule per line
    * @throws IOException
    * @throws IllegalArgumentException
    */
@@ -93,68 +100,82 @@ public abstract class RegexURLFilterBase
 
   /**
    * Constructs a new RegexURLFilter and init it with a Reader of rules.
-   * @param reader is a reader of rules.
+   * 
+   * @param reader
+   *          is a reader of rules.
    */
-  protected RegexURLFilterBase(Reader reader)
-    throws IOException, IllegalArgumentException {
+  protected RegexURLFilterBase(Reader reader) throws IOException,
+      IllegalArgumentException {
     rules = readRules(reader);
   }
-  
+
   /**
    * Creates a new {@link RegexRule}.
-   * @param sign of the regular expression.
-   *        A <code>true</code> value means that any URL matching this rule
-   *        must be included, whereas a <code>false</code>
-   *        value means that any URL matching this rule must be excluded.
-   * @param regex is the regular expression associated to this rule.
+   * 
+   * @param sign
+   *          of the regular expression. A <code>true</code> value means that
+   *          any URL matching this rule must be included, whereas a
+   *          <code>false</code> value means that any URL matching this rule
+   *          must be excluded.
+   * @param regex
+   *          is the regular expression associated to this rule.
    */
   protected abstract RegexRule createRule(boolean sign, String regex);
-  
+
   /**
-   * Returns the name of the file of rules to use for
-   * a particular implementation.
-   * @param conf is the current configuration.
+   * Returns the name of the file of rules to use for a particular
+   * implementation.
+   * 
+   * @param conf
+   *          is the current configuration.
    * @return the name of the resource containing the rules to use.
    */
-  protected abstract Reader getRulesReader(Configuration conf) throws IOException;
-  
-  
-  /* -------------------------- *
-   * <implementation:URLFilter> *
-   * -------------------------- */
-  
+  protected abstract Reader getRulesReader(Configuration conf)
+      throws IOException;
+
+  /*
+   * -------------------------- * <implementation:URLFilter> *
+   * --------------------------
+   */
+
   // Inherited Javadoc
   public String filter(String url) {
     for (RegexRule rule : rules) {
       if (rule.match(url)) {
         return rule.accept() ? url : null;
       }
-    };
+    }
+    ;
     return null;
   }
 
-  /* --------------------------- *
-   * </implementation:URLFilter> *
-   * --------------------------- */
-  
-  
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-  
+  /*
+   * --------------------------- * </implementation:URLFilter> *
+   * ---------------------------
+   */
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
   public void setConf(Configuration conf) {
     this.conf = conf;
     Reader reader = null;
     try {
       reader = getRulesReader(conf);
     } catch (Exception e) {
-      if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
-      throw new RuntimeException(e.getMessage(), e);      
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
     }
     try {
       rules = readRules(reader);
     } catch (IOException e) {
-      if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
       throw new RuntimeException(e.getMessage(), e);
     }
   }
@@ -162,45 +183,51 @@ public abstract class RegexURLFilterBase
   public Configuration getConf() {
     return this.conf;
   }
-  
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-  
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
 
   /**
    * Read the specified file of rules.
-   * @param reader is a reader of regular expressions rules.
+   * 
+   * @param reader
+   *          is a reader of regular expressions rules.
    * @return the corresponding {@RegexRule rules}.
    */
-  private List<RegexRule> readRules(Reader reader)
-    throws IOException, IllegalArgumentException {
+  private List<RegexRule> readRules(Reader reader) throws IOException,
+      IllegalArgumentException {
 
     BufferedReader in = new BufferedReader(reader);
     List<RegexRule> rules = new ArrayList<RegexRule>();
     String line;
-       
-    while((line=in.readLine())!=null) {
+
+    while ((line = in.readLine()) != null) {
       if (line.length() == 0) {
         continue;
       }
-      char first=line.charAt(0);
-      boolean sign=false;
+      char first = line.charAt(0);
+      boolean sign = false;
       switch (first) {
-      case '+' : 
-        sign=true;
+      case '+':
+        sign = true;
         break;
-      case '-' :
-        sign=false;
+      case '-':
+        sign = false;
         break;
-      case ' ' : case '\n' : case '#' :           // skip blank & comment lines
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
         continue;
-      default :
-        throw new IOException("Invalid first character: "+line);
+      default:
+        throw new IOException("Invalid first character: " + line);
       }
 
       String regex = line.substring(1);
-      if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Adding rule [" + regex + "]");
+      }
       RegexRule rule = createRule(sign, regex);
       rules.add(rule);
     }
@@ -209,18 +236,20 @@ public abstract class RegexURLFilterBase
 
   /**
    * Filter the standard input using a RegexURLFilterBase.
-   * @param filter is the RegexURLFilterBase to use for filtering the
-   *        standard input.
-   * @param args some optional parameters (not used).
+   * 
+   * @param filter
+   *          is the RegexURLFilterBase to use for filtering the standard input.
+   * @param args
+   *          some optional parameters (not used).
    */
   public static void main(RegexURLFilterBase filter, String args[])
-    throws IOException, IllegalArgumentException {
+      throws IOException, IllegalArgumentException {
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
+    while ((line = in.readLine()) != null) {
       String out = filter.filter(line);
-      if (out!=null) {
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {

Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)
+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Fri Jan  9 06:34:33 2015
@@ -42,52 +42,52 @@ import org.apache.nutch.net.URLFilter;
  * JUnit based test of class <code>RegexURLFilterBase</code>.
  */
 
-//@RunWith(Suite.class)
-//@Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class})
+// @RunWith(Suite.class)
+// @Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class})
 public abstract class RegexURLFilterBaseTest {
-  
+
   /** My logger */
-  protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);  
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBaseTest.class);
 
-  private final static String SEPARATOR = System.getProperty("file.separator");  
+  private final static String SEPARATOR = System.getProperty("file.separator");
   private final static String SAMPLES = System.getProperty("test.data", ".");
-  
+
   protected abstract URLFilter getURLFilter(Reader rules);
 
   protected void bench(int loops, String file) {
     try {
-      bench(loops,
-            new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-            new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
     } catch (Exception e) {
       fail(e.toString());
     }
   }
-  
+
   protected void bench(int loops, Reader rules, Reader urls) {
     long start = System.currentTimeMillis();
     try {
       URLFilter filter = getURLFilter(rules);
       FilteredURL[] expected = readURLFile(urls);
-      for (int i=0; i<loops; i++) {
+      for (int i = 0; i < loops; i++) {
         test(filter, expected);
       }
     } catch (Exception e) {
       fail(e.toString());
     }
-    LOG.info("bench time (" + loops + ") " +
-             (System.currentTimeMillis()-start) + "ms");
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
   }
-  
+
   protected void test(String file) {
     try {
       test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-           new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
     } catch (Exception e) {
       fail(e.toString());
     }
   }
-  
+
   protected void test(Reader rules, Reader urls) {
     try {
       test(getURLFilter(rules), readURLFile(urls));
@@ -95,9 +95,9 @@ public abstract class RegexURLFilterBase
       fail(e.toString());
     }
   }
-  
+
   protected void test(URLFilter filter, FilteredURL[] expected) {
-    for (int i=0; i<expected.length; i++) {
+    for (int i = 0; i < expected.length; i++) {
       String result = filter.filter(expected[i].url);
       if (result != null) {
         assertTrue(expected[i].url, expected[i].sign);
@@ -106,37 +106,37 @@ public abstract class RegexURLFilterBase
       }
     }
   }
-  
+
   private static FilteredURL[] readURLFile(Reader reader) throws IOException {
     BufferedReader in = new BufferedReader(reader);
     List<FilteredURL> list = new ArrayList<FilteredURL>();
     String line;
-    while((line=in.readLine()) != null) {
+    while ((line = in.readLine()) != null) {
       if (line.length() != 0) {
         list.add(new FilteredURL(line));
       }
     }
     return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
   }
-    
+
   private static class FilteredURL {
-  
+
     boolean sign;
     String url;
 
     FilteredURL(String line) {
       switch (line.charAt(0)) {
-      case '+' : 
+      case '+':
         sign = true;
         break;
-      case '-' :
+      case '-':
         sign = false;
         break;
-      default :
+      default:
         // Simply ignore...
       }
       url = line.substring(1);
     }
   }
-  
+
 }

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -39,7 +39,7 @@ import org.apache.nutch.util.Bytes;
  * @author J&eacute;r&ocirc;me Charron
  */
 public class RelTagIndexingFilter implements IndexingFilter {
-  
+
   private Configuration conf;
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -50,10 +50,9 @@ public class RelTagIndexingFilter implem
   }
 
   /**
-   * Gets all the fields for a given {@link WebPage}
-   * Many datastores need to setup the mapreduce job by specifying the fields
-   * needed. All extensions that work on WebPage are able to specify what fields
-   * they need.
+   * Gets all the fields for a given {@link WebPage} Many datastores need to
+   * setup the mapreduce job by specifying the fields needed. All extensions
+   * that work on WebPage are able to specify what fields they need.
    */
   @Override
   public Collection<Field> getFields() {
@@ -73,24 +72,28 @@ public class RelTagIndexingFilter implem
   public Configuration getConf() {
     return this.conf;
   }
-  
+
   /**
    * The {@link RelTagIndexingFilter} filter object.
-   *  
-   * @param doc The {@link NutchDocument} object
-   * @param url URL to be filtered for rel-tag's
-   * @param page {@link WebPage} object relative to the URL
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param url
+   *          URL to be filtered for rel-tag's
+   * @param page
+   *          {@link WebPage} object relative to the URL
    * @return filtered NutchDocument
    */
   @Override
-  public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException {
-  // Check if some Rel-Tags found, possibly put there by RelTagParser
+  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+      throws IndexingException {
+    // Check if some Rel-Tags found, possibly put there by RelTagParser
     ByteBuffer bb = page.getMetadata().get(new Utf8(RelTagParser.REL_TAG));
-		
+
     if (bb != null) {
       String[] tags = Bytes.toString(bb).split("\t");
       for (int i = 0; i < tags.length; i++) {
-	    doc.add("tag", tags[i]);
+        doc.add("tag", tags[i]);
       }
     }
     return doc;

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Jan  9 06:34:33 2015
@@ -74,26 +74,26 @@ public class RelTagParser implements Par
       if (node.getNodeType() == Node.ELEMENT_NODE) {
         // Look for <a> tag
         if ("a".equalsIgnoreCase(node.getNodeName())) {
-	  NamedNodeMap attrs = node.getAttributes();
-	  Node hrefNode = attrs.getNamedItem("href");
-	  // Checks that it contains a href attribute
-	  if (hrefNode != null) {
-	    Node relNode = attrs.getNamedItem("rel");
-	    // Checks that it contains a rel attribute too
-	    if (relNode != null) {
-	      // Finaly checks that rel=tag
-	      if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
-	        String tag = parseTag(hrefNode.getNodeValue());
-	        if (!StringUtil.isEmpty(tag)) {
-	          if(!tags.contains(tag)){
+          NamedNodeMap attrs = node.getAttributes();
+          Node hrefNode = attrs.getNamedItem("href");
+          // Checks that it contains a href attribute
+          if (hrefNode != null) {
+            Node relNode = attrs.getNamedItem("rel");
+            // Checks that it contains a rel attribute too
+            if (relNode != null) {
+              // Finaly checks that rel=tag
+              if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+                String tag = parseTag(hrefNode.getNodeValue());
+                if (!StringUtil.isEmpty(tag)) {
+                  if (!tags.contains(tag)) {
                     tags.add(tag);
-		    LOG.debug("Adding tag: " + tag + " to tag set.");
+                    LOG.debug("Adding tag: " + tag + " to tag set.");
                   }
-	        }
-	      }
-	    }
-	  }
-	}
+                }
+              }
+            }
+          }
+        }
       }
 
       // Recurse
@@ -108,11 +108,13 @@ public class RelTagParser implements Par
       try {
         URL u = new URL(url);
         String path = u.getPath();
-        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+            "UTF-8");
       } catch (Exception e) {
         // Malformed tag...
         tag = null;
-      } return tag;
+      }
+      return tag;
     }
   }
 
@@ -136,12 +138,11 @@ public class RelTagParser implements Par
     FIELDS.add(WebPage.Field.BASE_URL);
     FIELDS.add(WebPage.Field.METADATA);
   }
-  
+
   /**
-   * Gets all the fields for a given {@link WebPage}
-   * Many datastores need to setup the mapreduce job by specifying the fields
-   * needed. All extensions that work on WebPage are able to specify what fields
-   * they need.
+   * Gets all the fields for a given {@link WebPage} Many datastores need to
+   * setup the mapreduce job by specifying the fields needed. All extensions
+   * that work on WebPage are able to specify what fields they need.
    */
   @Override
   public Collection<Field> getFields() {

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -28,13 +28,13 @@ import java.nio.ByteBuffer;
 import static org.junit.Assert.*;
 
 /**
- *JUnit test case for {@link RelTagIndexingFilter} which 
- *simply asserts that a 'tag' field is obtained by the filter.
- *
- *@author lewismc
+ * JUnit test case for {@link RelTagIndexingFilter} which simply asserts that a
+ * 'tag' field is obtained by the filter.
+ * 
+ * @author lewismc
  */
 
-  public class TestRelTagIndexingFilter {
+public class TestRelTagIndexingFilter {
 
   @Test
   public void testRelTagFields() throws Exception {
@@ -57,4 +57,3 @@ import static org.junit.Assert.*;
     assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag"));
   }
 }
-  
\ No newline at end of file

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Fri Jan  9 06:34:33 2015
@@ -38,15 +38,15 @@ import java.nio.ByteBuffer;
 import static org.junit.Assert.assertEquals;
 
 /**
- * Junit test for {@link RelTagParser} based mainly John Xing's parser tests.
- * We are not concerned with actual parse text within the sample file, instead
- * we assert that the rel-tags we expect are found in the WebPage metadata.
- * To check the parser is working as expected we unwrap the ByteBuffer obtained 
- * from metadata, the same type as we use in expected (String). So just the 
+ * Junit test for {@link RelTagParser} based mainly John Xing's parser tests. We
+ * are not concerned with actual parse text within the sample file, instead we
+ * assert that the rel-tags we expect are found in the WebPage metadata. To
+ * check the parser is working as expected we unwrap the ByteBuffer obtained
+ * from metadata, the same type as we use in expected (String). So just the
  * other way around as we wrapped the metadata value.
  * 
  * @author lewismc
- *
+ * 
  */
 public class TestRelTagParser {
 
@@ -58,14 +58,15 @@ public class TestRelTagParser {
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/microformats-reltag/build.xml during plugin compilation.
   private String sampleFile = "microformats_reltag_test.html";
-  
+
   // rel-tag's we expect to be extracted from page.getMetadata()
   private String expectedRelTags = "Category:Specifications	Category:rel-tag	";
-  
+
   private Configuration conf;
-  
+
   @Test
-  public void testRelTagParser() throws ParseException, ProtocolException, IOException {
+  public void testRelTagParser() throws ParseException, ProtocolException,
+      IOException {
     conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
     @SuppressWarnings("unused")
@@ -85,14 +86,14 @@ public class TestRelTagParser {
     String mtype = mimeutil.getMimeType(file);
     page.setContentType(new Utf8(mtype));
     parse = new ParseUtil(conf).parse(urlString, page);
-    //begin assertion for tests
+    // begin assertion for tests
     ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag"));
     byte[] byteArray = new byte[bbuf.remaining()];
     bbuf.get(byteArray);
     String s = new String(byteArray);
-    //bbuf.flip();
-    assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", 
-      expectedRelTags, s);
+    // bbuf.flip();
+    assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
+        expectedRelTags, s);
   }
-  
+
 }
\ No newline at end of file