Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC

svn commit: r1655526 [22/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Modified: nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * (number of "hops" from seed URLs).
  */
 package org.apache.nutch.scoring.depth;
+

Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -32,8 +32,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.scoring.ScoringFilter;
 import org.apache.nutch.scoring.ScoringFilterException;
 
-public class LinkAnalysisScoringFilter
-  implements ScoringFilter {
+public class LinkAnalysisScoringFilter implements ScoringFilter {
 
   private Configuration conf;
   private float normalizedScore = 1.00f;
@@ -52,46 +51,44 @@ public class LinkAnalysisScoringFilter
   }
 
   public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-    ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-    CrawlDatum adjust, int allCount)
-    throws ScoringFilterException {
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
     return adjust;
   }
 
   public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
-    throws ScoringFilterException {
+      throws ScoringFilterException {
     return datum.getScore() * initSort;
   }
 
   public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
-    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
-    throws ScoringFilterException {
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
     return (normalizedScore * dbDatum.getScore());
   }
 
   public void initialScore(Text url, CrawlDatum datum)
-    throws ScoringFilterException {
+      throws ScoringFilterException {
     datum.setScore(0.0f);
   }
 
   public void injectedScore(Text url, CrawlDatum datum)
-    throws ScoringFilterException {
+      throws ScoringFilterException {
   }
 
   public void passScoreAfterParsing(Text url, Content content, Parse parse)
-    throws ScoringFilterException {
-    parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
-      content.getMetadata().get(Nutch.SCORE_KEY));
+      throws ScoringFilterException {
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
   }
 
   public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
-    throws ScoringFilterException {
+      throws ScoringFilterException {
     content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
   }
 
   public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
-    List<CrawlDatum> inlinked)
-    throws ScoringFilterException {
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
     // nothing to do
   }
 

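For readers skimming the reformatted hunks above: LinkAnalysisScoringFilter implements Nutch's ScoringFilter extension point. As a reference, here is a minimal no-op sketch of that interface, assuming it declares exactly the methods that appear in this commit's hunks; the class name and the trivial bodies are illustrative and not part of Nutch.

    import java.util.Collection;
    import java.util.List;
    import java.util.Map.Entry;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.crawl.Inlinks;
    import org.apache.nutch.indexer.NutchDocument;
    import org.apache.nutch.parse.Parse;
    import org.apache.nutch.parse.ParseData;
    import org.apache.nutch.protocol.Content;
    import org.apache.nutch.scoring.ScoringFilter;
    import org.apache.nutch.scoring.ScoringFilterException;

    /** Illustrative no-op ScoringFilter; signatures mirror the hunks above. */
    public class NoOpScoringFilter implements ScoringFilter {

      private Configuration conf;

      public void setConf(Configuration conf) {
        this.conf = conf;
      }

      public Configuration getConf() {
        return conf;
      }

      public void injectedScore(Text url, CrawlDatum datum)
          throws ScoringFilterException {
        // keep the injected score as-is
      }

      public void initialScore(Text url, CrawlDatum datum)
          throws ScoringFilterException {
        datum.setScore(0.0f);
      }

      public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
          throws ScoringFilterException {
        return datum.getScore() * initSort;
      }

      public void passScoreBeforeParsing(Text url, CrawlDatum datum,
          Content content) throws ScoringFilterException {
        // nothing to pass along
      }

      public void passScoreAfterParsing(Text url, Content content, Parse parse)
          throws ScoringFilterException {
        // nothing to pass along
      }

      public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
          ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
          CrawlDatum adjust, int allCount) throws ScoringFilterException {
        return adjust;
      }

      public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
          List<CrawlDatum> inlinked) throws ScoringFilterException {
        // nothing to do
      }

      public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
          CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
          throws ScoringFilterException {
        return initScore;
      }
    }
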
Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
  */
 package org.apache.nutch.scoring.link;
+

Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -41,17 +41,17 @@ import org.apache.nutch.scoring.ScoringF
 
 /**
  * This plugin implements a variant of an Online Page Importance Computation
- * (OPIC) score, described in this paper:
- * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
- * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003),
- * Adaptive On-Line Page Importance Computation
- * </a>.
+ * (OPIC) score, described in this paper: <a
+ * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
+ * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
+ * On-Line Page Importance Computation </a>.
  * 
  * @author Andrzej Bialecki
  */
 public class OPICScoringFilter implements ScoringFilter {
 
-  private final static Logger LOG = LoggerFactory.getLogger(OPICScoringFilter.class);
+  private final static Logger LOG = LoggerFactory
+      .getLogger(OPICScoringFilter.class);
 
   private Configuration conf;
   private float scoreInjected;
@@ -72,28 +72,35 @@ public class OPICScoringFilter implement
     countFiltered = conf.getBoolean("db.score.count.filtered", false);
   }
 
-  public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
   }
 
-  /** Set to 0.0f (unknown value) - inlink contributions will bring it to
-   * a correct level. Newly discovered pages have at least one inlink. */
-  public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+  /**
+   * Set to 0.0f (unknown value) - inlink contributions will bring it to a
+   * correct level. Newly discovered pages have at least one inlink.
+   */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
     datum.setScore(0.0f);
   }
 
   /** Use {@link CrawlDatum#getScore()}. */
-  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
     return datum.getScore() * initSort;
   }
 
   /** Increase the score by a sum of inlinked scores. */
-  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
     float adjust = 0.0f;
     for (int i = 0; i < inlinked.size(); i++) {
       CrawlDatum linked = inlinked.get(i);
       adjust += linked.getScore();
     }
-    if (old == null) old = datum;
+    if (old == null)
+      old = datum;
     datum.setScore(old.getScore() + adjust);
   }
 
@@ -104,11 +111,17 @@ public class OPICScoringFilter implement
 
   /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
   public void passScoreAfterParsing(Text url, Content content, Parse parse) {
-    parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
   }
 
-  /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException {
+  /**
+   * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
+   * outlinks and apply.
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
     float score = scoreInjected;
     String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
     if (scoreString != null) {
@@ -135,7 +148,7 @@ public class OPICScoringFilter implement
       try {
         String toHost = new URL(target.getKey().toString()).getHost();
         String fromHost = new URL(fromUrl.toString()).getHost();
-        if(toHost.equalsIgnoreCase(fromHost)){
+        if (toHost.equalsIgnoreCase(fromHost)) {
           target.getValue().setScore(internalScore);
         } else {
           target.getValue().setScore(externalScore);
@@ -151,8 +164,10 @@ public class OPICScoringFilter implement
     return adjust;
   }
 
-  /** Dampen the boost value by scorePower.*/
-  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
-    return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore;
+  /** Dampen the boost value by scorePower. */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
   }
 }

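As a quick worked example of the behaviour documented in the distributeScoreToOutlinks javadoc above ("divide it by the number of outlinks and apply"): a page whose recorded score is 1.0 and which has four outlink targets passes a share of 0.25 to each. The arithmetic below is illustrative only; the host comparison visible in the hunk then assigns either internalScore or externalScore to each target.

    // Illustrative arithmetic only, not Nutch code.
    public class OpicShareExample {
      public static void main(String[] args) {
        float score = 1.0f; // value read from the SCORE_KEY metadata
        int allCount = 4;   // number of outlink targets
        float share = score / allCount;
        System.out.println(share); // prints 0.25
      }
    }
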
Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * (OPIC) algorithm.
  */
 package org.apache.nutch.scoring.opic;
+

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Jan 29 05:38:59 2015
@@ -52,22 +52,24 @@ public class CollectionManager extends C
   transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
 
   transient URL configfile;
-  
+
   public CollectionManager(Configuration conf) {
     super(conf);
     init();
   }
-  
-  /** 
+
+  /**
    * Used for testing
    */
-  protected CollectionManager(){
+  protected CollectionManager() {
     super(NutchConfiguration.create());
   }
 
-  protected void init(){
+  protected void init() {
     try {
-      if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
+      if (LOG.isInfoEnabled()) {
+        LOG.info("initializing CollectionManager");
+      }
       // initialize known subcollections
       configfile = getConf().getResource(
           getConf().get("subcollections.config", DEFAULT_FILE_NAME));
@@ -92,7 +94,7 @@ public class CollectionManager extends C
       if (LOG.isInfoEnabled()) {
         LOG.info("file has " + nodeList.getLength() + " elements");
       }
-      
+
       for (int i = 0; i < nodeList.getLength(); i++) {
         Element scElem = (Element) nodeList.item(i);
         Subcollection subCol = new Subcollection(getConf());
@@ -103,18 +105,18 @@ public class CollectionManager extends C
       LOG.info("Cannot find collections");
     }
   }
-  
+
   public static CollectionManager getCollectionManager(Configuration conf) {
     String key = "collectionmanager";
     ObjectCache objectCache = ObjectCache.get(conf);
-    CollectionManager impl = (CollectionManager)objectCache.getObject(key);
+    CollectionManager impl = (CollectionManager) objectCache.getObject(key);
     if (impl == null) {
       try {
         if (LOG.isInfoEnabled()) {
           LOG.info("Instantiating CollectionManager");
         }
-        impl=new CollectionManager(conf);
-        objectCache.setObject(key,impl);
+        impl = new CollectionManager(conf);
+        objectCache.setObject(key, impl);
       } catch (Exception e) {
         throw new RuntimeException("Couldn't create CollectionManager", e);
       }
@@ -165,7 +167,7 @@ public class CollectionManager extends C
 
   /**
    * Return names of collections url is part of
-   *
+   * 
    * @param url
    *          The url to test against Collections
    * @return Subcollections
@@ -203,8 +205,8 @@ public class CollectionManager extends C
    */
   public void save() throws IOException {
     try {
-      final FileOutputStream fos = new FileOutputStream(new File(configfile
-          .getFile()));
+      final FileOutputStream fos = new FileOutputStream(new File(
+          configfile.getFile()));
       final Document doc = new DocumentImpl();
       final Element collections = doc
           .createElement(Subcollection.TAG_COLLECTIONS);

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Jan 29 05:38:59 2015
@@ -32,20 +32,20 @@ import org.w3c.dom.NodeList;
  * SubCollection represents a subset of index, you can define url patterns that
  * will indicate that particular page (url) is part of SubCollection.
  */
-public class Subcollection extends Configured implements URLFilter{
-  
-  public static final String TAG_COLLECTIONS="subcollections";
-  public static final String TAG_COLLECTION="subcollection";
-  public static final String TAG_WHITELIST="whitelist";
-  public static final String TAG_BLACKLIST="blacklist";
-  public static final String TAG_NAME="name";
-  public static final String TAG_KEY="key";
-  public static final String TAG_ID="id";
+public class Subcollection extends Configured implements URLFilter {
+
+  public static final String TAG_COLLECTIONS = "subcollections";
+  public static final String TAG_COLLECTION = "subcollection";
+  public static final String TAG_WHITELIST = "whitelist";
+  public static final String TAG_BLACKLIST = "blacklist";
+  public static final String TAG_NAME = "name";
+  public static final String TAG_KEY = "key";
+  public static final String TAG_ID = "id";
 
   List<String> blackList = new ArrayList<String>();
   List<String> whiteList = new ArrayList<String>();
 
-  /** 
+  /**
    * SubCollection identifier
    */
   String id;
@@ -55,12 +55,12 @@ public class Subcollection extends Confi
    */
   String key;
 
-  /** 
+  /**
    * SubCollection name
    */
   String name;
 
-  /** 
+  /**
    * SubCollection whitelist as String
    */
   String wlString;
@@ -70,31 +70,37 @@ public class Subcollection extends Confi
    */
   String blString;
 
-  /** public Constructor
+  /**
+   * public Constructor
    * 
-   * @param id id of SubCollection
-   * @param name name of SubCollection
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
    */
   public Subcollection(String id, String name, Configuration conf) {
     this(id, name, null, conf);
   }
 
-  /** public Constructor
-   *
-   * @param id id of SubCollection
-   * @param name name of SubCollection
+  /**
+   * public Constructor
+   * 
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
    */
   public Subcollection(String id, String name, String key, Configuration conf) {
     this(conf);
-    this.id=id;
+    this.id = id;
     this.key = key;
     this.name = name;
   }
 
-  public Subcollection(Configuration conf){
+  public Subcollection(Configuration conf) {
     super(conf);
   }
-  
+
   /**
    * @return Returns the name
    */
@@ -232,7 +238,8 @@ public class Subcollection extends Confi
   /**
    * Set contents of blacklist from String
    * 
-   * @param list the blacklist contents
+   * @param list
+   *          the blacklist contents
    */
   public void setBlackList(String list) {
     this.blString = list;
@@ -242,7 +249,8 @@ public class Subcollection extends Confi
   /**
    * Set contents of whitelist from String
    * 
-   * @param list the whitelist contents
+   * @param list
+   *          the whitelist contents
    */
   public void setWhiteList(String list) {
     this.wlString = list;

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -35,21 +35,22 @@ import org.apache.nutch.collection.Subco
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 
-
-public class SubcollectionIndexingFilter extends Configured implements IndexingFilter {
+public class SubcollectionIndexingFilter extends Configured implements
+    IndexingFilter {
 
   private Configuration conf;
 
-  public SubcollectionIndexingFilter(){
+  public SubcollectionIndexingFilter() {
     super(NutchConfiguration.create());
   }
-  
+
   public SubcollectionIndexingFilter(Configuration conf) {
     super(conf);
   }
-  
+
   /**
-   * @param Configuration conf
+   * @param Configuration
+   *          conf
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -63,7 +64,6 @@ public class SubcollectionIndexingFilter
     return this.conf;
   }
 
-  
   /**
    * Doc field name
    */
@@ -72,7 +72,8 @@ public class SubcollectionIndexingFilter
   /**
    * Logger
    */
-  public static final Logger LOG = LoggerFactory.getLogger(SubcollectionIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SubcollectionIndexingFilter.class);
 
   /**
    * "Mark" document to be a part of subcollection
@@ -81,7 +82,8 @@ public class SubcollectionIndexingFilter
    * @param url
    */
   private void addSubCollectionField(NutchDocument doc, String url) {
-    for (Subcollection coll : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+    for (Subcollection coll : CollectionManager.getCollectionManager(getConf())
+        .getSubCollections(url)) {
       if (coll.getKey() == null) {
         doc.add(fieldName, coll.getName());
       } else {
@@ -90,7 +92,8 @@ public class SubcollectionIndexingFilter
     }
   }
 
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
     String sUrl = url.toString();
     addSubCollectionField(doc, sUrl);
     return doc;

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java Thu Jan 29 05:38:59 2015
@@ -22,3 +22,4 @@
  * {@link org.apache.nutch.collection}.
  */
 package org.apache.nutch.indexer.subcollection;
+

Modified: nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Thu Jan 29 05:38:59 2015
@@ -25,31 +25,34 @@ import org.junit.Assert;
 import org.junit.Test;
 
 public class TestSubcollection {
-  
-  /**Test filtering logic
+
+  /**
+   * Test filtering logic
    * 
    * @throws Exception
    */
   @Test
   public void testFilter() throws Exception {
-    Subcollection sc=new Subcollection(NutchConfiguration.create());
+    Subcollection sc = new Subcollection(NutchConfiguration.create());
     sc.setWhiteList("www.nutch.org\nwww.apache.org");
     sc.setBlackList("jpg\nwww.apache.org/zecret/");
-    
-    //matches whitelist
-    Assert.assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html"));
-    
-    //matches blacklist
-    Assert.assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html"));
+
+    // matches whitelist
+    Assert.assertEquals("http://www.apache.org/index.html",
+        sc.filter("http://www.apache.org/index.html"));
+
+    // matches blacklist
+    Assert.assertEquals(null,
+        sc.filter("http://www.apache.org/zecret/index.html"));
     Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg"));
-    
-    //no match
+
+    // no match
     Assert.assertEquals(null, sc.filter("http://www.google.com/"));
   }
-  
+
   @Test
-  public void testInput(){
-    StringBuffer xml=new StringBuffer();
+  public void testInput() {
+    StringBuffer xml = new StringBuffer();
     xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
     xml.append("<!-- just a comment -->");
     xml.append("<subcollections>");
@@ -65,44 +68,45 @@ public class TestSubcollection {
     xml.append("</blacklist>");
     xml.append("</subcollection>");
     xml.append("</subcollections>");
-    
-    InputStream is=new ByteArrayInputStream(xml.toString().getBytes());
-    
-    CollectionManager cm=new CollectionManager();
+
+    InputStream is = new ByteArrayInputStream(xml.toString().getBytes());
+
+    CollectionManager cm = new CollectionManager();
     cm.parse(is);
-    
-    Collection<?> c=cm.getAll();
-    
+
+    Collection<?> c = cm.getAll();
+
     // test that size matches
-    Assert.assertEquals(1,c.size());
-    
-    Subcollection collection=(Subcollection)c.toArray()[0];
-    
-    //test collection id
+    Assert.assertEquals(1, c.size());
+
+    Subcollection collection = (Subcollection) c.toArray()[0];
+
+    // test collection id
     Assert.assertEquals("nutch", collection.getId());
-    
-    //test collection name
+
+    // test collection name
     Assert.assertEquals("nutch collection", collection.getName());
 
-    //test whitelist
-    Assert.assertEquals(2,collection.whiteList.size());
-    
-    String wlUrl=(String)collection.whiteList.get(0);
+    // test whitelist
+    Assert.assertEquals(2, collection.whiteList.size());
+
+    String wlUrl = (String) collection.whiteList.get(0);
     Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl);
 
-    wlUrl=(String)collection.whiteList.get(1);
+    wlUrl = (String) collection.whiteList.get(1);
     Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl);
-    
-    //matches whitelist
-    Assert.assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/"));
 
-    //test blacklist
-    Assert.assertEquals(1,collection.blackList.size());
+    // matches whitelist
+    Assert.assertEquals("http://lucene.apache.org/nutch/",
+        collection.filter("http://lucene.apache.org/nutch/"));
+
+    // test blacklist
+    Assert.assertEquals(1, collection.blackList.size());
 
-    String blUrl=(String)collection.blackList.get(0);
+    String blUrl = (String) collection.blackList.get(0);
     Assert.assertEquals("http://www.xxx.yyy", blUrl);
 
-    //no match
+    // no match
     Assert.assertEquals(null, collection.filter("http://www.google.com/"));
   }
 }

Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,23 +34,25 @@ import org.apache.nutch.util.domain.Doma
 
 /**
  * Adds the Top level domain extensions to the index
+ * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
  */
 public class TLDIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(TLDIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TLDIndexingFilter.class);
 
   private Configuration conf;
 
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
-  throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
     try {
       URL url = new URL(urlText.toString());
       DomainSuffix d = URLUtil.getDomainSuffix(url);
-      
+
       doc.add("tld", d.getDomain());
-      
-    }catch (Exception ex) {
+
+    } catch (Exception ex) {
       LOG.warn(ex.toString());
     }
 

Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -35,9 +35,9 @@ import org.apache.nutch.scoring.ScoringF
 import org.apache.nutch.util.domain.DomainSuffix;
 import org.apache.nutch.util.domain.DomainSuffixes;
 
-
 /**
  * Scoring filter to boost tlds.
+ * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
  */
 public class TLDScoringFilter implements ScoringFilter {
@@ -56,10 +56,10 @@ public class TLDScoringFilter implements
     NutchField tlds = doc.getField("tld");
     float boost = 1.0f;
 
-    if(tlds != null) {
-      for(Object tld : tlds.getValues()) {
+    if (tlds != null) {
+      for (Object tld : tlds.getValues()) {
         DomainSuffix entry = tldEntries.get(tld.toString());
-        if(entry != null)
+        if (entry != null)
           boost *= entry.getBoost();
       }
     }
@@ -93,9 +93,8 @@ public class TLDScoringFilter implements
       throws ScoringFilterException {
   }
 
-  public void updateDbScore(Text url, CrawlDatum old,
-                            CrawlDatum datum, List<CrawlDatum> inlinked)
-  throws ScoringFilterException {
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
   }
 
   public Configuration getConf() {
@@ -105,9 +104,10 @@ public class TLDScoringFilter implements
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, 
-          Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
-          int allCount) throws ScoringFilterException {
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
     return adjust;
   }
 

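A note on the reformatted loop in TLDScoringFilter above: the boost starts at 1.0 and is multiplied by the boost of every matching DomainSuffix entry found for the document's "tld" field values. A tiny sketch with hypothetical boost values (how the accumulated boost is ultimately applied lies outside the hunk):

    // Hypothetical boost values; mirrors the accumulation loop shown above.
    public class TldBoostExample {
      public static void main(String[] args) {
        float boost = 1.0f;
        float[] entryBoosts = { 1.5f, 2.0f }; // boosts of matching entries
        for (float b : entryBoosts) {
          boost *= b;
        }
        System.out.println(boost); // prints 3.0
      }
    }
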
Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Thu Jan 29 05:38:59 2015
@@ -32,12 +32,11 @@ import org.apache.nutch.net.*;
 import org.apache.nutch.urlfilter.api.RegexRule;
 import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
 
-
 /**
- * RegexURLFilterBase implementation based on the
- * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
- * Finite-State Automata for Java<sup>TM</sup>.
- *
+ * RegexURLFilterBase implementation based on the <a
+ * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+ * Automata for Java<sup>TM</sup>.
+ * 
  * @author J&eacute;r&ocirc;me Charron
  * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
  */
@@ -49,24 +48,24 @@ public class AutomatonURLFilter extends
     super();
   }
 
-  public AutomatonURLFilter(String filename)
-    throws IOException, PatternSyntaxException {
+  public AutomatonURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
     super(filename);
   }
 
-  AutomatonURLFilter(Reader reader)
-    throws IOException, IllegalArgumentException {
+  AutomatonURLFilter(Reader reader) throws IOException,
+      IllegalArgumentException {
     super(reader);
   }
 
-  
-  /* ----------------------------------- *
-   * <implementation:RegexURLFilterBase> *
-   * ----------------------------------- */
-  
+  /*
+   * ----------------------------------- * <implementation:RegexURLFilterBase> *
+   * -----------------------------------
+   */
+
   /**
-   * Rules specified as a config property will override rules specified
-   * as a config file.
+   * Rules specified as a config property will override rules specified as a
+   * config file.
    */
   protected Reader getRulesReader(Configuration conf) throws IOException {
     String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
@@ -81,21 +80,20 @@ public class AutomatonURLFilter extends
   protected RegexRule createRule(boolean sign, String regex) {
     return new Rule(sign, regex);
   }
-  
-  /* ------------------------------------ *
-   * </implementation:RegexURLFilterBase> *
-   * ------------------------------------ */
 
-  
+  /*
+   * ------------------------------------ * </implementation:RegexURLFilterBase>
+   * * ------------------------------------
+   */
+
   public static void main(String args[]) throws IOException {
     main(new AutomatonURLFilter(), args);
   }
 
-
   private class Rule extends RegexRule {
-    
+
     private RunAutomaton automaton;
-    
+
     Rule(boolean sign, String regex) {
       super(sign, regex);
       automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
@@ -105,5 +103,5 @@ public class AutomatonURLFilter extends
       return automaton.run(url);
     }
   }
-  
+
 }

Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Thu Jan 29 05:38:59 2015
@@ -26,10 +26,9 @@ import org.apache.nutch.urlfilter.api.Re
 import org.junit.Assert;
 import org.junit.Test;
 
-
 /**
  * JUnit based test of class <code>AutomatonURLFilter</code>.
- *
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
@@ -42,7 +41,7 @@ public class TestAutomatonURLFilter exte
       return null;
     }
   }
-  
+
   @Test
   public void test() {
     test("WholeWebCrawling");

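AutomatonURLFilter above and RegexURLFilter further down both extend RegexURLFilterBase, whose createRule(boolean sign, String regex) hook reflects the usual Nutch rule-file convention: one regular expression per line, prefixed with '+' to accept or '-' to reject, with '#' starting a comment. A hypothetical rules file (not part of this commit) might look like:

    # skip common image URLs
    -\.(gif|jpg|png)$
    # accept anything under apache.org
    +^https?://([a-z0-9-]+\.)*apache\.org/
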
Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Jan 29 05:38:59 2015
@@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.domain.DomainSuffix;
 
 /**
- * <p>Filters URLs based on a file containing domain suffixes, domain names, and
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
  * hostnames. Only a url that matches one of the suffixes, domains, or hosts
- * present in the file is allowed.</p>
+ * present in the file is allowed.
+ * </p>
  * 
- * <p>Urls are checked in order of domain suffix, domain name, and hostname
- * against entries in the domain file. The domain file would be setup as follows
- * with one entry per line:
- * 
- * <pre> com apache.org www.apache.org </pre>
- * 
- * <p>The first line is an example of a filter that would allow all .com
- * domains. The second line allows all urls from apache.org and all of its
- * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
- * would allow only urls from www.apache.org. There is no specific ordering to
- * entries. The entries are from more general to more specific with the more
- * general overridding the more specific.</p>
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
  * 
  * The domain file defaults to domain-urlfilter.txt in the classpath but can be
  * overridden using the:
  * 
- * <ul> <ol>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</ol>
- * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
+ * <ul>
+ * <ol>
+ * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
  * 
  * the attribute "file" has higher precedence if defined.
  */
-public class DomainURLFilter
-  implements URLFilter {
+public class DomainURLFilter implements URLFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(DomainURLFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainURLFilter.class);
 
   // read in attribute "file" of this plugin.
   private static String attributeFile = null;
@@ -71,8 +84,7 @@ public class DomainURLFilter
   private String domainFile = null;
   private Set<String> domainSet = new LinkedHashSet<String>();
 
-  private void readConfiguration(Reader configReader)
-    throws IOException {
+  private void readConfiguration(Reader configReader) throws IOException {
 
     // read the configuration file, line by line
     BufferedReader reader = new BufferedReader(configReader);
@@ -95,7 +107,8 @@ public class DomainURLFilter
   /**
    * Constructor that specifies the domain file to use.
    * 
-   * @param domainFile The domain file, overrides domain-urlfilter.text default.
+   * @param domainFile
+   *          The domain file, overrides domain-urlfilter.text default.
    * 
    * @throws IOException
    */
@@ -111,8 +124,8 @@ public class DomainURLFilter
 
     // get the extensions for domain urlfilter
     String pluginName = "urlfilter-domain";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-      URLFilter.class.getName()).getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -120,32 +133,30 @@ public class DomainURLFilter
         break;
       }
     }
-    
+
     // handle blank non empty input
     if (attributeFile != null && attributeFile.trim().equals("")) {
       attributeFile = null;
     }
-    
+
     if (attributeFile != null) {
       if (LOG.isInfoEnabled()) {
         LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-          + " as " + attributeFile);
+            + " as " + attributeFile);
       }
-    }
-    else {
+    } else {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-          + pluginName);
+            + pluginName);
       }
     }
 
     // domain file and attribute "file" take precedence if defined
-    String file = conf.get("urlfilter.domain.file");    
+    String file = conf.get("urlfilter.domain.file");
     String stringRules = conf.get("urlfilter.domain.rules");
     if (domainFile != null) {
       file = domainFile;
-    }
-    else if (attributeFile != null) {
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
@@ -159,8 +170,7 @@ public class DomainURLFilter
         reader = new FileReader(file);
       }
       readConfiguration(reader);
-    }
-    catch (IOException e) {
+    } catch (IOException e) {
       LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
     }
   }
@@ -173,7 +183,7 @@ public class DomainURLFilter
 
     try {
 
-      // match for suffix, domain, and host in that order.  more general will
+      // match for suffix, domain, and host in that order. more general will
       // override more specific
       String domain = URLUtil.getDomainName(url).toLowerCase().trim();
       String host = URLUtil.getHost(url);
@@ -182,20 +192,19 @@ public class DomainURLFilter
       if (domainSuffix != null) {
         suffix = domainSuffix.getDomain();
       }
-      
+
       if (domainSet.contains(suffix) || domainSet.contains(domain)
-        || domainSet.contains(host)) {
+          || domainSet.contains(host)) {
         return url;
       }
 
       // doesn't match, don't allow
       return null;
-    }
-    catch (Exception e) {
-      
+    } catch (Exception e) {
+
       // if an error happens, allow the url to pass
       LOG.error("Could not apply filter on url: " + url + "\n"
-        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
       return null;
     }
   }

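Following the class javadoc above (the domain file holds one suffix, domain name, or hostname per line, e.g. "com", "apache.org", "www.apache.org"), here is a hypothetical usage sketch in the style of the tests in this commit; the file path and URLs below are illustrative only:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.urlfilter.domain.DomainURLFilter;
    import org.apache.nutch.util.NutchConfiguration;

    public class DomainFilterExample {
      public static void main(String[] args) throws Exception {
        // hypothetical file containing the three javadoc entries,
        // one per line: com, apache.org, www.apache.org
        DomainURLFilter filter = new DomainURLFilter("/tmp/domain-urlfilter.txt");
        Configuration conf = NutchConfiguration.create();
        filter.setConf(conf);

        // a suffix, domain, or host match lets the URL pass unchanged
        System.out.println(filter.filter("http://lucene.apache.org/"));
        // a URL matching no entry comes back as null
        System.out.println(filter.filter("http://example.net/"));
      }
    }
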
Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Jan 29 05:38:59 2015
@@ -23,13 +23,11 @@ import org.junit.Test;
 
 public class TestDomainURLFilter {
 
-
   private final static String SEPARATOR = System.getProperty("file.separator");
   private final static String SAMPLES = System.getProperty("test.data", ".");
 
   @Test
-  public void testFilter()
-    throws Exception {
+  public void testFilter() throws Exception {
 
     String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
     Configuration conf = NutchConfiguration.create();

Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015
@@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.domain.DomainSuffix;
 
 /**
- * <p>Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. A url that matches one of the suffixes, domains, or hosts
- * present in the file is filtered out.</p>
- * 
- * <p>Urls are checked in order of domain suffix, domain name, and hostname
- * against entries in the domain file. The domain file would be setup as follows
- * with one entry per line:
- * 
- * <pre> com apache.org www.apache.org </pre>
- * 
- * <p>The first line is an example of a filter that would allow all .com
- * domains. The second line allows all urls from apache.org and all of its
- * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
- * would allow only urls from www.apache.org. There is no specific ordering to
- * entries. The entries are from more general to more specific with the more
- * general overridding the more specific.</p>
- * 
- * The domain file defaults to domainblacklist-urlfilter.txt in the classpath but can be
- * overridden using the:
- * 
- * <ul> <ol>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</ol>
- * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ * 
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
+ * 
+ * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:
+ * 
+ * <ul>
+ * <ol>
+ * property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
  * 
  * the attribute "file" has higher precedence if defined.
  */
-public class DomainBlacklistURLFilter
-  implements URLFilter {
+public class DomainBlacklistURLFilter implements URLFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainBlacklistURLFilter.class);
 
   // read in attribute "file" of this plugin.
   private static String attributeFile = null;
@@ -71,8 +84,7 @@ public class DomainBlacklistURLFilter
   private String domainFile = null;
   private Set<String> domainSet = new LinkedHashSet<String>();
 
-  private void readConfiguration(Reader configReader)
-    throws IOException {
+  private void readConfiguration(Reader configReader) throws IOException {
 
     // read the configuration file, line by line
     BufferedReader reader = new BufferedReader(configReader);
@@ -95,7 +107,8 @@ public class DomainBlacklistURLFilter
   /**
    * Constructor that specifies the domain file to use.
    * 
-   * @param domainFile The domain file, overrides domainblacklist-urlfilter.text default.
+   * @param domainFile
+   *          The domain file, overrides domainblacklist-urlfilter.text default.
    * 
    * @throws IOException
    */
@@ -111,8 +124,8 @@ public class DomainBlacklistURLFilter
 
     // get the extensions for domain urlfilter
     String pluginName = "urlfilter-domainblacklist";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-      URLFilter.class.getName()).getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -129,23 +142,21 @@ public class DomainBlacklistURLFilter
     if (attributeFile != null) {
       if (LOG.isInfoEnabled()) {
         LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-          + " as " + attributeFile);
+            + " as " + attributeFile);
       }
-    }
-    else {
+    } else {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-          + pluginName);
+            + pluginName);
       }
     }
 
     // domain file and attribute "file" take precedence if defined
-    String file = conf.get("urlfilter.domainblacklist.file");    
+    String file = conf.get("urlfilter.domainblacklist.file");
     String stringRules = conf.get("urlfilter.domainblacklist.rules");
     if (domainFile != null) {
       file = domainFile;
-    }
-    else if (attributeFile != null) {
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
@@ -159,8 +170,7 @@ public class DomainBlacklistURLFilter
         reader = new FileReader(file);
       }
       readConfiguration(reader);
-    }
-    catch (IOException e) {
+    } catch (IOException e) {
       LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
     }
   }
@@ -171,7 +181,7 @@ public class DomainBlacklistURLFilter
 
   public String filter(String url) {
     try {
-      // match for suffix, domain, and host in that order.  more general will
+      // match for suffix, domain, and host in that order. more general will
       // override more specific
       String domain = URLUtil.getDomainName(url).toLowerCase().trim();
       String host = URLUtil.getHost(url);
@@ -182,19 +192,18 @@ public class DomainBlacklistURLFilter
       }
 
       if (domainSet.contains(suffix) || domainSet.contains(domain)
-        || domainSet.contains(host)) {
+          || domainSet.contains(host)) {
         // Matches, filter!
         return null;
       }
 
       // doesn't match, allow
       return url;
-    }
-    catch (Exception e) {
+    } catch (Exception e) {
 
       // if an error happens, allow the url to pass
       LOG.error("Could not apply filter on url: " + url + "\n"
-        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
       return null;
     }
   }

Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015
@@ -27,12 +27,12 @@ public class TestDomainBlacklistURLFilte
   private final static String SAMPLES = System.getProperty("test.data", ".");
 
   @Test
-  public void testFilter()
-    throws Exception {
+  public void testFilter() throws Exception {
 
     String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
     Configuration conf = NutchConfiguration.create();
-    DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(domainBlacklistFile);
+    DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
+        domainBlacklistFile);
     domainBlacklistFilter.setConf(conf);
     Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
     Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));

Modified: nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -39,16 +39,19 @@ import java.util.List;
 import java.util.ArrayList;
 
 /**
- * Filters URLs based on a file of URL prefixes. The file is named by
- * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and
- * (2) attribute "file" in plugin.xml of this plugin
- * Attribute "file" has higher precedence if defined.
- *
- * <p>The format of this file is one URL prefix per line.</p>
+ * Filters URLs based on a file of URL prefixes. The file is named by (1)
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
+ * attribute "file" in plugin.xml of this plugin Attribute "file" has higher
+ * precedence if defined.
+ * 
+ * <p>
+ * The format of this file is one URL prefix per line.
+ * </p>
  */
 public class PrefixURLFilter implements URLFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(PrefixURLFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(PrefixURLFilter.class);
 
   // read in attribute "file" of this plugin.
   private static String attributeFile = null;
@@ -58,7 +61,7 @@ public class PrefixURLFilter implements
   private Configuration conf;
 
   public PrefixURLFilter() throws IOException {
-   
+
   }
 
   public PrefixURLFilter(String stringRules) throws IOException {
@@ -72,43 +75,43 @@ public class PrefixURLFilter implements
       return url;
   }
 
-  private TrieStringMatcher readConfiguration(Reader reader)
-    throws IOException {
-    
-    BufferedReader in=new BufferedReader(reader);
+  private TrieStringMatcher readConfiguration(Reader reader) throws IOException {
+
+    BufferedReader in = new BufferedReader(reader);
     List<String> urlprefixes = new ArrayList<String>();
     String line;
 
-    while((line=in.readLine())!=null) {
+    while ((line = in.readLine()) != null) {
       if (line.length() == 0)
         continue;
 
-      char first=line.charAt(0);
+      char first = line.charAt(0);
       switch (first) {
-      case ' ' : case '\n' : case '#' :           // skip blank & comment lines
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
         continue;
-      default :
-	urlprefixes.add(line);
+      default:
+        urlprefixes.add(line);
       }
     }
 
     return new PrefixStringMatcher(urlprefixes);
   }
 
-  public static void main(String args[])
-    throws IOException {
-    
+  public static void main(String args[]) throws IOException {
+
     PrefixURLFilter filter;
     if (args.length >= 1)
       filter = new PrefixURLFilter(args[0]);
     else
       filter = new PrefixURLFilter();
-    
-    BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
-      String out=filter.filter(line);
-      if(out!=null) {
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
         System.out.println(out);
       }
     }
@@ -118,8 +121,8 @@ public class PrefixURLFilter implements
     this.conf = conf;
 
     String pluginName = "urlfilter-prefix";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-        URLFilter.class.getName()).getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -136,8 +139,8 @@ public class PrefixURLFilter implements
       }
     } else {
       // if (LOG.isWarnEnabled()) {
-      //   LOG.warn("Attribute \"file\" is not defined in plugin.xml for
-      //   plugin "+pluginName);
+      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+      // plugin "+pluginName);
       // }
     }
 
@@ -159,7 +162,9 @@ public class PrefixURLFilter implements
       try {
         trie = readConfiguration(reader);
       } catch (IOException e) {
-        if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+        if (LOG.isErrorEnabled()) {
+          LOG.error(e.getMessage());
+        }
         // TODO mb@media-style.com: throw Exception? Because broken api.
         throw new RuntimeException(e.getMessage(), e);
       }
@@ -169,5 +174,5 @@ public class PrefixURLFilter implements
   public Configuration getConf() {
     return this.conf;
   }
-  
+
 }

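For context, PrefixURLFilter accepts a URL only when it starts with one of the configured prefixes and returns null otherwise. A minimal usage sketch, assuming the rules are passed in as a string; the sample prefixes below are illustrative:

    // Minimal sketch: rules may also come from the file named by
    // "urlfilter.prefix.file" or by the plugin's "file" attribute.
    String rules = "# one URL prefix per line\n"
        + "http://www.example.com/\n"
        + "https://docs.example.org/\n";
    PrefixURLFilter filter = new PrefixURLFilter(rules);
    filter.filter("http://www.example.com/index.html");  // returns the URL
    filter.filter("http://other.example.net/page.html"); // returns null
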
Modified: nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Thu Jan 29 05:38:59 2015
@@ -28,13 +28,12 @@ import org.apache.nutch.urlfilter.api.Re
 import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
 import org.apache.nutch.util.NutchConfiguration;
 
-
 /**
  * Filters URLs based on a file of regular expressions using the
  * {@link java.util.regex Java Regex implementation}.
  */
 public class RegexURLFilter extends RegexURLFilterBase {
-  
+
   public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
   public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
 
@@ -42,24 +41,23 @@ public class RegexURLFilter extends Rege
     super();
   }
 
-  public RegexURLFilter(String filename)
-    throws IOException, PatternSyntaxException {
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
     super(filename);
   }
 
-  RegexURLFilter(Reader reader)
-    throws IOException, IllegalArgumentException {
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
     super(reader);
   }
 
-  
-  /* ----------------------------------- *
-   * <implementation:RegexURLFilterBase> *
-   * ----------------------------------- */
-  
+  /*
+   * <implementation:RegexURLFilterBase>
+   * -----------------------------------
+   */
+
   /**
-   * Rules specified as a config property will override rules specified
-   * as a config file.
+   * Rules specified as a config property will override rules specified as a
+   * config file.
    */
   protected Reader getRulesReader(Configuration conf) throws IOException {
     String stringRules = conf.get(URLFILTER_REGEX_RULES);
@@ -74,23 +72,22 @@ public class RegexURLFilter extends Rege
   protected RegexRule createRule(boolean sign, String regex) {
     return new Rule(sign, regex);
   }
-  
-  /* ------------------------------------ *
-   * </implementation:RegexURLFilterBase> *
-   * ------------------------------------ */
 
-  
+  /*
+   * </implementation:RegexURLFilterBase>
+   * ------------------------------------
+   */
+
   public static void main(String args[]) throws IOException {
     RegexURLFilter filter = new RegexURLFilter();
     filter.setConf(NutchConfiguration.create());
     main(filter, args);
   }
 
-
   private class Rule extends RegexRule {
-    
+
     private Pattern pattern;
-    
+
     Rule(boolean sign, String regex) {
       super(sign, regex);
       pattern = Pattern.compile(regex);
@@ -100,5 +97,5 @@ public class RegexURLFilter extends Rege
       return pattern.matcher(url).find();
     }
   }
-  
+
 }

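As the Javadoc above states, regex rules supplied through the "urlfilter.regex.rules" property override the rules file named by "urlfilter.regex.file". A minimal sketch, assuming the rules are set inline on the configuration; the patterns themselves are illustrative:

    // Minimal sketch: the first matching rule wins; '-' rejects, '+' accepts.
    Configuration conf = NutchConfiguration.create();
    conf.set(RegexURLFilter.URLFILTER_REGEX_RULES,
        "-\\.(gif|jpg|png)$\n"   // reject common image suffixes
      + "+.");                   // accept everything else
    RegexURLFilter filter = new RegexURLFilter();
    filter.setConf(conf);
    filter.filter("http://www.example.com/logo.gif");   // returns null
    filter.filter("http://www.example.com/index.html"); // returns the URL
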
Modified: nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Thu Jan 29 05:38:59 2015
@@ -26,15 +26,13 @@ import org.apache.nutch.urlfilter.api.Re
 import org.junit.Assert;
 import org.junit.Test;
 
-
 /**
  * JUnit based test of class <code>RegexURLFilter</code>.
- *
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public class TestRegexURLFilter extends RegexURLFilterBaseTest {
 
-
   protected URLFilter getURLFilter(Reader rules) {
     try {
       return new RegexURLFilter(rules);
@@ -43,7 +41,7 @@ public class TestRegexURLFilter extends
       return null;
     }
   }
-  
+
   @Test
   public void test() {
     test("WholeWebCrawling");

Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -51,14 +51,15 @@ import java.net.MalformedURLException;
  * Attribute "file" has higher precedence if defined. If the config file is
  * missing, all URLs will be rejected.
  * 
- * <p>This filter can be configured to work in one of two modes:
+ * <p>
+ * This filter can be configured to work in one of two modes:
  * <ul>
- * <li><b>default to reject</b> ('-'): in this mode, only URLs that match suffixes
- * specified in the config file will be accepted, all other URLs will be
- * rejected.</li>
- * <li><b>default to accept</b> ('+'): in this mode, only URLs that match suffixes
- * specified in the config file will be rejected, all other URLs will be
- * accepted.</li>
+ * <li><b>default to reject</b> ('-'): in this mode, only URLs that match
+ * suffixes specified in the config file will be accepted, all other URLs will
+ * be rejected.</li>
+ * <li><b>default to accept</b> ('+'): in this mode, only URLs that match
+ * suffixes specified in the config file will be rejected, all other URLs will
+ * be accepted.</li>
  * </ul>
  * <p>
  * The format of this config file is one URL suffix per line, with no preceding
@@ -67,10 +68,10 @@ import java.net.MalformedURLException;
  * </p>
  * <p>
  * A single '+' or '-' sign not followed by any suffix must be used once, to
- * signify the mode this plugin operates in. An optional single 'I' can be appended,
- * to signify that suffix matches should be case-insensitive. The default, if 
- * not specified, is to use case-sensitive matches, i.e. suffix '.JPG'
- * does not match '.jpg'.
+ * signify the mode this plugin operates in. An optional single 'I' can be
+ * appended, to signify that suffix matches should be case-insensitive. The
+ * default, if not specified, is to use case-sensitive matches, i.e. suffix
+ * '.JPG' does not match '.jpg'.
  * </p>
  * <p>
  * NOTE: the format of this file is different from urlfilter-prefix, because
@@ -82,8 +83,8 @@ import java.net.MalformedURLException;
  * <h4>Example 1</h4>
  * <p>
  * The configuration shown below will accept all URLs with '.html' or '.htm'
- * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected),
- * and prohibit all other suffixes.
+ * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
+ * all other suffixes.
  * <p>
  * 
  * <pre>
@@ -91,7 +92,7 @@ import java.net.MalformedURLException;
  *  
  *  # prohibit all unknown, case-sensitive matching
  *  -
- *
+ * 
  *  # collect only HTML files.
  *  .html
  *  .htm
@@ -119,11 +120,13 @@ import java.net.MalformedURLException;
  * </pre>
  * 
  * </p>
+ * 
  * @author Andrzej Bialecki
  */
 public class SuffixURLFilter implements URLFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(SuffixURLFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SuffixURLFilter.class);
 
   // read in attribute "file" of this plugin.
   private String attributeFile = null;
@@ -144,11 +147,13 @@ public class SuffixURLFilter implements
   }
 
   public String filter(String url) {
-    if (url == null) return null;
+    if (url == null)
+      return null;
     String _url;
     if (ignoreCase)
       _url = url.toLowerCase();
-    else _url = url;
+    else
+      _url = url;
     if (filterFromPath) {
       try {
         URL pUrl = new URL(_url);
@@ -160,11 +165,15 @@ public class SuffixURLFilter implements
 
     String a = suffixes.shortestMatch(_url);
     if (a == null) {
-      if (modeAccept) return url;
-      else return null;
+      if (modeAccept)
+        return url;
+      else
+        return null;
     } else {
-      if (modeAccept) return null;
-      else return url;
+      if (modeAccept)
+        return null;
+      else
+        return url;
     }
   }
 
@@ -187,30 +196,31 @@ public class SuffixURLFilter implements
     String line;
 
     while ((line = in.readLine()) != null) {
-      if (line.length() == 0) continue;
+      if (line.length() == 0)
+        continue;
 
       char first = line.charAt(0);
       switch (first) {
-        case ' ':
-        case '\n':
-        case '#': // skip blank & comment lines
-          break;
-        case '-':
-          allow = false;
-          if(line.contains("P"))
-            filterFromPath = true;
-          if(line.contains("I"))
-            ignore = true;
-          break;
-        case '+':
-          allow = true;
-          if(line.contains("P"))
-            filterFromPath = true;
-          if(line.contains("I"))
-            ignore = true;
-          break;
-        default:
-          aSuffixes.add(line);
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        break;
+      case '-':
+        allow = false;
+        if (line.contains("P"))
+          filterFromPath = true;
+        if (line.contains("I"))
+          ignore = true;
+        break;
+      case '+':
+        allow = true;
+        if (line.contains("P"))
+          filterFromPath = true;
+        if (line.contains("I"))
+          ignore = true;
+        break;
+      default:
+        aSuffixes.add(line);
       }
     }
     if (ignore) {
@@ -249,7 +259,8 @@ public class SuffixURLFilter implements
     this.conf = conf;
 
     String pluginName = "urlfilter-suffix";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -257,22 +268,25 @@ public class SuffixURLFilter implements
         break;
       }
     }
-    if (attributeFile != null && attributeFile.trim().equals("")) attributeFile = null;
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
     if (attributeFile != null) {
       if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile);
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
       }
     } else {
       // if (LOG.isWarnEnabled()) {
-      //   LOG.warn("Attribute \"file\" is not defined in plugin.xml for
-      //   plugin "+pluginName);
+      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+      // plugin "+pluginName);
       // }
     }
 
     String file = conf.get("urlfilter.suffix.file");
     String stringRules = conf.get("urlfilter.suffix.rules");
     // attribute "file" takes precedence if defined
-    if (attributeFile != null) file = attributeFile;
+    if (attributeFile != null)
+      file = attributeFile;
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
@@ -283,7 +297,9 @@ public class SuffixURLFilter implements
     try {
       readConfiguration(reader);
     } catch (IOException e) {
-      if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
       throw new RuntimeException(e.getMessage(), e);
     }
   }

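A minimal sketch of the suffix rules format described in the Javadoc above, using "default to accept" mode ('+') with case-insensitive matching ('I'); the suffix list is illustrative (compare the test below):

    // Minimal sketch: '+I' = accept unknown suffixes, match case-insensitively.
    String rules = "+I\n"
        + "# reject archives and executables\n"
        + ".zip\n"
        + ".exe\n";
    SuffixURLFilter filter = new SuffixURLFilter(new StringReader(rules));
    filter.filter("http://www.example.com/report.pdf");  // returns the URL
    filter.filter("http://www.example.com/ARCHIVE.ZIP"); // returns null
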
Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -25,101 +25,45 @@ import org.junit.Test;
 
 /**
  * JUnit test for <code>SuffixURLFilter</code>.
- *
+ * 
  * @author Andrzej Bialecki
  */
 public class TestSuffixURLFilter {
-  private static final String suffixes =
-    "# this is a comment\n" +
-    "\n" +
-    ".gif\n" +
-    ".jpg\n" +
-    ".js\n";
-  
+  private static final String suffixes = "# this is a comment\n" + "\n"
+      + ".gif\n" + ".jpg\n" + ".js\n";
+
   private static final String[] urls = new String[] {
-    "http://www.example.com/test.gif",
-    "http://www.example.com/TEST.GIF",
-    "http://www.example.com/test.jpg",
-    "http://www.example.com/test.JPG",
-    "http://www.example.com/test.html",
-    "http://www.example.com/test.HTML",
-    "http://www.example.com/test.html?q=abc.js",
-    "http://www.example.com/test.js?foo=bar&baz=bar#12333",
-  };
-  
-  private static String[] urlsModeAccept = new String[] {
-    null,
-    urls[1],
-    null,
-    urls[3],
-    urls[4],
-    urls[5],
-    null,
-    urls[7]
-  };
-  
-  private static String[] urlsModeReject = new String[] {
-    urls[0],
-    null,
-    urls[2],
-    null,
-    null,
-    null,
-    urls[6],
-    null
-  };
-  
-  private static String[] urlsModeAcceptIgnoreCase = new String[] {
-    null,
-    null,
-    null,
-    null,
-    urls[4],
-    urls[5],
-    null,
-    urls[7]
-  };
- 
-  private static String[] urlsModeRejectIgnoreCase = new String[] {
-    urls[0],
-    urls[1],
-    urls[2],
-    urls[3],
-    null,
-    null,
-    urls[6],
-    null
-  };
-  
-  private static String[] urlsModeAcceptAndPathFilter = new String[] {
-    null,
-    urls[1],
-    null,
-    urls[3],
-    urls[4],
-    urls[5],
-    urls[6],
-    null
-  };
-  
-  private static String[] urlsModeAcceptAndNonPathFilter = new String[] {
-    null,
-    urls[1],
-    null,
-    urls[3],
-    urls[4],
-    urls[5],
-    null,
-    urls[7]
-  };
-  
+      "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+      "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+      "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+      "http://www.example.com/test.html?q=abc.js",
+      "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+  private static String[] urlsModeAccept = new String[] { null, urls[1], null,
+      urls[3], urls[4], urls[5], null, urls[7] };
+
+  private static String[] urlsModeReject = new String[] { urls[0], null,
+      urls[2], null, null, null, urls[6], null };
+
+  private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null,
+      null, null, urls[4], urls[5], null, urls[7] };
+
+  private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0],
+      urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+  private static String[] urlsModeAcceptAndPathFilter = new String[] { null,
+      urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+  private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null,
+      urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
   private SuffixURLFilter filter = null;
-  
+
   @Before
   public void setUp() throws IOException {
     filter = new SuffixURLFilter(new StringReader(suffixes));
   }
-  
+
   @Test
   public void testModeAccept() {
     filter.setIgnoreCase(false);
@@ -155,22 +99,24 @@ public class TestSuffixURLFilter {
       Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
     }
   }
-  
+
   @Test
   public void testModeAcceptAndNonPathFilter() {
     filter.setModeAccept(true);
     filter.setFilterFromPath(false);
     for (int i = 0; i < urls.length; i++) {
-      Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i]));
+      Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter
+          .filter(urls[i]));
     }
   }
-  
+
   @Test
   public void testModeAcceptAndPathFilter() {
     filter.setModeAccept(true);
     filter.setFilterFromPath(true);
     for (int i = 0; i < urls.length; i++) {
-      Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i]));
+      Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter
+          .filter(urls[i]));
     }
   }
 

Modified: nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original)
+++ nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Thu Jan 29 05:38:59 2015
@@ -23,12 +23,16 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.nutch.net.URLFilter;
 
 /**
- * <p>Validates URLs.</p>
- *
- * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b,
- * Date: 03/07/02,
- * http://javascript.internet.com. However, this validation now bears little
- * resemblance to the php original.</p>
+ * <p>
+ * Validates URLs.
+ * </p>
+ * 
+ * <p>
+ * Originally based on a php script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the php original.
+ * </p>
+ * 
  * <pre>
  *   Example of usage:
  *    UrlValidator urlValidator = UrlValidator.get();
@@ -37,17 +41,17 @@ import org.apache.nutch.net.URLFilter;
  *    } else {
  *       System.out.println("url is invalid");
  *    }
- *
+ * 
  *   prints out "url is valid"
- *  </pre>
- *
- * <p>Based on UrlValidator code from Apache commons-validator.</p>
- *
- * @see
- * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
- *  Uniform Resource Identifiers (URI): Generic Syntax
- * </a>
- *
+ * </pre>
+ * 
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ * 
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ *      Identifiers (URI): Generic Syntax </a>
+ * 
  */
 public class UrlValidator implements URLFilter {
 
@@ -61,7 +65,7 @@ public class UrlValidator implements URL
 
   private static final String SCHEME_CHARS = ALPHA_CHARS;
 
-  // Drop numeric, and  "+-." for now
+  // Drop numeric, and "+-." for now
   private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
 
   private static final String ATOM = VALID_CHARS + '+';
@@ -69,9 +73,9 @@ public class UrlValidator implements URL
   /**
    * This expression derived/taken from the BNF for URI (RFC2396).
    */
-  private static final Pattern URL_PATTERN =
-    Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" +
-                    "(\\?([^#]*))?(#(.*))?");
+  private static final Pattern URL_PATTERN = Pattern
+      .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+          + "(\\?([^#]*))?(#(.*))?");
 
   /**
    * Schema/Protocol (ie. http:, ftp:, file:, etc).
@@ -90,11 +94,11 @@ public class UrlValidator implements URL
   /**
    * Protocol (ie. http:, ftp:,https:).
    */
-  private static final Pattern SCHEME_PATTERN =
-    Pattern.compile("^[" + SCHEME_CHARS + "]+");
+  private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+      + SCHEME_CHARS + "]+");
 
-  private static final Pattern AUTHORITY_PATTERN =
-    Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+  private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+      + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
 
   private static final int PARSE_AUTHORITY_HOST_IP = 1;
 
@@ -105,28 +109,26 @@ public class UrlValidator implements URL
    */
   private static final int PARSE_AUTHORITY_EXTRA = 3;
 
-  private static final Pattern PATH_PATTERN =
-    Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+  private static final Pattern PATH_PATTERN = Pattern
+      .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
 
   private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
 
-  private static final Pattern LEGAL_ASCII_PATTERN =
-    Pattern.compile("^[\\x21-\\x7E]+$");
+  private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+      .compile("^[\\x21-\\x7E]+$");
 
-  private static final Pattern IP_V4_DOMAIN_PATTERN =
-    Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+  private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+      .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
 
-  private static final Pattern DOMAIN_PATTERN =
-    Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
+  private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+      + "(\\." + ATOM + ")*$");
 
-  private static final Pattern PORT_PATTERN =
-    Pattern.compile("^:(\\d{1,5})$");
+  private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
 
-  private static final Pattern ATOM_PATTERN =
-    Pattern.compile("(" + ATOM + ")");
+  private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
 
-  private static final Pattern ALPHA_PATTERN =
-    Pattern.compile("^[" + ALPHA_CHARS + "]");
+  private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+      + ALPHA_CHARS + "]");
 
   private Configuration conf;
 
@@ -143,10 +145,13 @@ public class UrlValidator implements URL
   }
 
   /**
-   * <p>Checks if a field has a valid url address.</p>
-   *
-   * @param value The value validation is being performed on.
-   * A <code>null</code> value is considered invalid.
+   * <p>
+   * Checks if a field has a valid url address.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on. A <code>null</code>
+   *          value is considered invalid.
    * @return true if the url is valid.
    */
   private boolean isValid(String value) {
@@ -184,11 +189,13 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Validate scheme. If schemes[] was initialized to a non null,
-   * then only those scheme's are allowed.  Note this is slightly different
-   * than for the constructor.
-   * @param scheme The scheme to validate.  A <code>null</code> value is
-   * considered invalid.
+   * Validate scheme. If schemes[] was initialized to a non-null value, then
+   * only those schemes are allowed. Note this is slightly different than for
+   * the constructor.
+   * 
+   * @param scheme
+   *          The scheme to validate. A <code>null</code> value is considered
+   *          invalid.
    * @return true if valid.
    */
   private boolean isValidScheme(String scheme) {
@@ -200,10 +207,12 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Returns true if the authority is properly formatted.  An authority is
-   * the combination of hostname and port.  A <code>null</code> authority
-   * value is considered invalid.
-   * @param authority Authority value to validate.
+   * Returns true if the authority is properly formatted. An authority is the
+   * combination of hostname and port. A <code>null</code> authority value is
+   * considered invalid.
+   * 
+   * @param authority
+   *          Authority value to validate.
    * @return true if authority (hostname and port) is valid.
    */
   private boolean isValidAuthority(String authority) {
@@ -235,7 +244,7 @@ public class UrlValidator implements URL
           if (Integer.parseInt(ipSegment) > 255) {
             return false;
           }
-        } catch(NumberFormatException e) {
+        } catch (NumberFormatException e) {
           return false;
         }
 
@@ -251,8 +260,8 @@ public class UrlValidator implements URL
       // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
       char[] chars = hostIP.toCharArray();
       int size = 1;
-      for(int i=0; i<chars.length; i++) {
-        if(chars[i] == '.') {
+      for (int i = 0; i < chars.length; i++) {
+        if (chars[i] == '.') {
           size++;
         }
       }
@@ -264,8 +273,7 @@ public class UrlValidator implements URL
       while (atomMatcher.find()) {
         domainSegment[segCount] = atomMatcher.group();
         segLen = domainSegment[segCount].length() + 1;
-        hostIP = (segLen >= hostIP.length()) ? ""
-                                             : hostIP.substring(segLen);
+        hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen);
         segCount++;
       }
       String topLevel = domainSegment[segCount - 1];
@@ -300,10 +308,13 @@ public class UrlValidator implements URL
   }
 
   /**
-   * <p>Checks if the field isn't null and length of the field is greater
-   * than zero not including whitespace.</p>
-   *
-   * @param value The value validation is being performed on.
+   * <p>
+   * Checks if the field isn't null and the length of the field is greater than
+   * zero, not including whitespace.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on.
    * @return true if blank or null.
    */
   private boolean isBlankOrNull(String value) {
@@ -311,9 +322,11 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Returns true if the path is valid.  A <code>null</code> value is
-   * considered invalid.
-   * @param path Path value to validate.
+   * Returns true if the path is valid. A <code>null</code> value is considered
+   * invalid.
+   * 
+   * @param path
+   *          Path value to validate.
    * @return true if path is valid.
    */
   private boolean isValidPath(String path) {
@@ -335,7 +348,9 @@ public class UrlValidator implements URL
   /**
    * Returns true if the query is null or it's a properly formatted query
    * string.
-   * @param query Query value to validate.
+   * 
+   * @param query
+   *          Query value to validate.
    * @return true if query is valid.
    */
   private boolean isValidQuery(String query) {
@@ -348,8 +363,11 @@ public class UrlValidator implements URL
 
   /**
    * Returns the number of times the token appears in the target.
-   * @param token Token value to be counted.
-   * @param target Target value to count tokens in.
+   * 
+   * @param token
+   *          Token value to be counted.
+   * @param target
+   *          Target value to count tokens in.
    * @return the number of tokens.
    */
   private int countToken(String token, String target) {

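Since UrlValidator implements URLFilter, the scheme/authority/path/query checks above can be exercised directly through filter(), which returns the URL when every component validates and null otherwise. A minimal sketch with illustrative URLs (compare the test below):

    UrlValidator validator = new UrlValidator();
    validator.filter("https://nutch.apache.org/index.html"); // returns the URL
    validator.filter("http://999.000.456.32/readme.txt");    // null: IPv4 octet > 255
    validator.filter("www.example.com/main.html");           // null: missing scheme
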
Modified: nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (original)
+++ nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Thu Jan 29 05:38:59 2015
@@ -21,40 +21,59 @@ import org.junit.Assert;
 import org.junit.Test;
 
 /**
- * JUnit test case which tests 
- * 1. that valid urls are not filtered while invalid ones are filtered.
- * 2. that Urls' scheme, authority, path and query are validated.
+ * JUnit test case which tests (1) that valid URLs are not filtered while
+ * invalid ones are filtered, and (2) that URLs' scheme, authority, path and
+ * query are validated.
  * 
  * @author tejasp
- *
+ * 
  */
 
 public class TestUrlValidator {
 
   /**
-   * Test method for {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}.
+   * Test method for
+   * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+   * .
    */
   @Test
   public void testFilter() {
     UrlValidator url_validator = new UrlValidator();
     Assert.assertNotNull(url_validator);
 
-    Assert.assertNull("Filtering on a null object should return null", url_validator.filter(null));
-    Assert.assertNull("Invalid url: example.com/file[/].html", url_validator.filter("example.com/file[/].html"));
-    Assert.assertNull("Invalid url: http://www.example.com/space here.html", url_validator.filter("http://www.example.com/space here.html"));
-    Assert.assertNull("Invalid url: /main.html", url_validator.filter("/main.html"));
-    Assert.assertNull("Invalid url: www.example.com/main.html", url_validator.filter("www.example.com/main.html"));
-    Assert.assertNull("Invalid url: ftp:www.example.com/main.html", url_validator.filter("ftp:www.example.com/main.html"));
-    Assert.assertNull("Inalid url: http://999.000.456.32/nutch/trunk/README.txt", 
+    Assert.assertNull("Filtering on a null object should return null",
+        url_validator.filter(null));
+    Assert.assertNull("Invalid url: example.com/file[/].html",
+        url_validator.filter("example.com/file[/].html"));
+    Assert.assertNull("Invalid url: http://www.example.com/space here.html",
+        url_validator.filter("http://www.example.com/space here.html"));
+    Assert.assertNull("Invalid url: /main.html",
+        url_validator.filter("/main.html"));
+    Assert.assertNull("Invalid url: www.example.com/main.html",
+        url_validator.filter("www.example.com/main.html"));
+    Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
+        url_validator.filter("ftp:www.example.com/main.html"));
+    Assert.assertNull(
+        "Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
         url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
-    Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+    Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+        url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
 
-    Assert.assertNotNull("Valid url: https://issues.apache.org/jira/NUTCH-1127", url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
-    Assert.assertNotNull("Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather", 
-        url_validator.filter("http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather"));
-    Assert.assertNotNull("Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", 
-        url_validator.filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
-    Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+    Assert.assertNotNull(
+        "Valid url: https://issues.apache.org/jira/NUTCH-1127",
+        url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather",
+            url_validator
+                .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+            url_validator
+                .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+    Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+        url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
 
   }
 }