You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [25/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java Thu Jan 29 05:38:59 2015
@@ -35,19 +35,19 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * New SegmentMerger unit test focusing on several crappy issues with the segment
- * merger. The general problem is disappearing records and incorrect CrawlDatum
- * status values. This unit test performs random sequences of segment merging where
- * we're looking for an expected status.
- * A second test is able to randomly inject redirects in segment, likely causing
- * the segment merger to fail resulting in a bad merged segment.
- *
+ * New SegmentMerger unit test focusing on several crappy issues with the
+ * segment merger. The general problem is disappearing records and incorrect
+ * CrawlDatum status values. This unit test performs random sequences of segment
+ * merging where we're looking for an expected status. A second test is able to
+ * randomly inject redirects in segment, likely causing the segment merger to
+ * fail resulting in a bad merged segment.
+ * 
  * See also:
- *
+ * 
  * https://issues.apache.org/jira/browse/NUTCH-1113
  * https://issues.apache.org/jira/browse/NUTCH-1616
  * https://issues.apache.org/jira/browse/NUTCH-1520
- *
+ * 
  * Cheers!
  */
 public class TestSegmentMergerCrawlDatums {
@@ -57,14 +57,14 @@ public class TestSegmentMergerCrawlDatum
 
   private static final Logger LOG = LoggerFactory
       .getLogger(TestSegmentMergerCrawlDatums.class);
-  
+
   @Before
   public void setUp() throws Exception {
     conf = NutchConfiguration.create();
     fs = FileSystem.get(conf);
     rnd = new Random();
   }
-  
+
   /**
    *
    */
@@ -75,15 +75,16 @@ public class TestSegmentMergerCrawlDatum
         new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE,
             CrawlDatum.STATUS_FETCH_SUCCESS, 256, false)));
   }
-  
+
   /**
    *
    */
   @Test
   public void testMostlyRedirects() throws Exception {
     // Our test directory
-    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
-    
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
     Path segment1 = new Path(testDir, "20140110114943");
     Path segment2 = new Path(testDir, "20140110114832");
     Path segment3 = new Path(testDir, "20140110114558");
@@ -92,7 +93,7 @@ public class TestSegmentMergerCrawlDatum
     Path segment6 = new Path(testDir, "20140110114507");
     Path segment7 = new Path(testDir, "20140110114903");
     Path segment8 = new Path(testDir, "20140110114724");
-    
+
     createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, true);
     createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true);
     createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, true);
@@ -101,32 +102,33 @@ public class TestSegmentMergerCrawlDatum
     createSegment(segment6, CrawlDatum.STATUS_FETCH_SUCCESS, false);
     createSegment(segment7, CrawlDatum.STATUS_FETCH_SUCCESS, true);
     createSegment(segment8, CrawlDatum.STATUS_FETCH_SUCCESS, true);
-    
+
     // Merge the segments and get status
-    Path mergedSegment = merge(testDir, new Path[]{segment1, segment2, segment3, segment4, segment5, segment6, segment7, segment8});
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2,
+        segment3, segment4, segment5, segment6, segment7, segment8 });
     Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
-    
+
     Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
   }
-  
+
   /**
    *
    */
   @Test
   public void testRandomizedSequences() throws Exception {
     for (int i = 0; i < rnd.nextInt(16) + 16; i++) {
-      byte expectedStatus = (byte)(rnd.nextInt(6) + 0x21);
+      byte expectedStatus = (byte) (rnd.nextInt(6) + 0x21);
       while (expectedStatus == CrawlDatum.STATUS_FETCH_RETRY
           || expectedStatus == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
         // fetch_retry and fetch_notmodified never remain in a merged segment
         expectedStatus = (byte) (rnd.nextInt(6) + 0x21);
       }
-      byte randomStatus = (byte)(rnd.nextInt(6) + 0x21);
+      byte randomStatus = (byte) (rnd.nextInt(6) + 0x21);
       int rounds = rnd.nextInt(16) + 32;
       boolean withRedirects = rnd.nextBoolean();
-      
-      byte resultStatus = executeSequence(randomStatus, expectedStatus,
-          rounds, withRedirects);
+
+      byte resultStatus = executeSequence(randomStatus, expectedStatus, rounds,
+          withRedirects);
       Assert.assertEquals(
           "Expected status = " + CrawlDatum.getStatusName(expectedStatus)
               + ", but got " + CrawlDatum.getStatusName(resultStatus)
@@ -135,196 +137,225 @@ public class TestSegmentMergerCrawlDatum
           resultStatus);
     }
   }
-  
+
   /**
    *
    */
   @Test
   public void testRandomTestSequenceWithRedirects() throws Exception {
-    Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, CrawlDatum.STATUS_FETCH_SUCCESS, 128, true)));
+    Assert.assertEquals(
+        new Byte(CrawlDatum.STATUS_FETCH_SUCCESS),
+        new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE,
+            CrawlDatum.STATUS_FETCH_SUCCESS, 128, true)));
   }
-  
+
   /**
    * Check a fixed sequence!
    */
   @Test
   public void testFixedSequence() throws Exception {
     // Our test directory
-    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
-    
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
     Path segment1 = new Path(testDir, "00001");
     Path segment2 = new Path(testDir, "00002");
     Path segment3 = new Path(testDir, "00003");
-    
+
     createSegment(segment1, CrawlDatum.STATUS_FETCH_GONE, false);
     createSegment(segment2, CrawlDatum.STATUS_FETCH_GONE, true);
     createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, false);
-    
+
     // Merge the segments and get status
-    Path mergedSegment = merge(testDir, new Path[]{segment1, segment2, segment3});
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2,
+        segment3 });
     Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
-    
+
     Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
   }
-  
+
   /**
    * Check a fixed sequence!
    */
   @Test
   public void testRedirFetchInOneSegment() throws Exception {
     // Our test directory
-    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
-    
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
     Path segment = new Path(testDir, "00001");
-    
+
     createSegment(segment, CrawlDatum.STATUS_FETCH_SUCCESS, true, true);
-    
+
     // Merge the segments and get status
-    Path mergedSegment = merge(testDir, new Path[]{segment});
+    Path mergedSegment = merge(testDir, new Path[] { segment });
     Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
-    
+
     Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
   }
-  
+
   /**
    * Check a fixed sequence!
    */
   @Test
   public void testEndsWithRedirect() throws Exception {
     // Our test directory
-    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
-    
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
     Path segment1 = new Path(testDir, "00001");
     Path segment2 = new Path(testDir, "00002");
-    
+
     createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, false);
     createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true);
-    
+
     // Merge the segments and get status
-    Path mergedSegment = merge(testDir, new Path[]{segment1, segment2});
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2 });
     Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
-    
+
     Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
   }
-  
+
   /**
-   * Execute a sequence of creating segments, merging them and checking the final output
-   *
-   * @param status to start with
-   * @param status to end with
-   * @param number of rounds
-   * @param whether redirects are injected randomly
+   * Execute a sequence of creating segments, merging them and checking the
+   * final output
+   * 
+   * @param status
+   *          to start with
+   * @param status
+   *          to end with
+   * @param number
+   *          of rounds
+   * @param whether
+   *          redirects are injected randomly
    * @return the CrawlDatum status
    */
-  protected byte executeSequence(byte firstStatus, byte lastStatus, int rounds, boolean redirect) throws Exception {
+  protected byte executeSequence(byte firstStatus, byte lastStatus, int rounds,
+      boolean redirect) throws Exception {
     // Our test directory
-    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
-    
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
     // Format for the segments
     DecimalFormat df = new DecimalFormat("0000000");
-    
+
     // Create our segment paths
     Path[] segmentPaths = new Path[rounds];
     for (int i = 0; i < rounds; i++) {
       String segmentName = df.format(i);
       segmentPaths[i] = new Path(testDir, segmentName);
     }
-       
+
     // Create the first segment according to the specified status
     createSegment(segmentPaths[0], firstStatus, false);
-    
-    // Create N segments with random status and optionally with randomized redirect injection
+
+    // Create N segments with random status and optionally with randomized
+    // redirect injection
     for (int i = 1; i < rounds - 1; i++) {
       // Status, 6 possibilities incremented with 33 hex
-      byte status = (byte)(rnd.nextInt(6) + 0x21);
-      
+      byte status = (byte) (rnd.nextInt(6) + 0x21);
+
       // Whether this is going to be a redirect
       boolean addRedirect = redirect ? rnd.nextBoolean() : false;
       // If it's a redirect we add a datum resulting from a fetch at random,
       // if not: always add a fetch datum to avoid empty segments
       boolean addFetch = addRedirect ? rnd.nextBoolean() : true;
-      
+
       createSegment(segmentPaths[i], status, addFetch, addRedirect);
     }
 
     // Create the last segment according to the specified status
     // (additionally, add a redirect at random)
-    createSegment(segmentPaths[rounds - 1], lastStatus, true, redirect ? rnd.nextBoolean() : false);
-    
+    createSegment(segmentPaths[rounds - 1], lastStatus, true,
+        redirect ? rnd.nextBoolean() : false);
+
     // Merge the segments!
     Path mergedSegment = merge(testDir, segmentPaths);
-    
+
     // Check the status of the final record and return it
     return checkMergedSegment(testDir, mergedSegment);
   }
-  
+
   /**
    * Checks the merged segment and removes the stuff again.
-   *
-   * @param the test directory
-   * @param the merged segment
+   * 
+   * @param the
+   *          test directory
+   * @param the
+   *          merged segment
    * @return the final status
    */
-  protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception  {
+  protected byte checkMergedSegment(Path testDir, Path mergedSegment)
+      throws Exception {
     // Get a MapFile reader for the <Text,CrawlDatum> pairs
-    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
-    
+    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
+        mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
+
     Text key = new Text();
     CrawlDatum value = new CrawlDatum();
     byte finalStatus = 0x0;
-    
+
     for (MapFile.Reader reader : readers) {
       while (reader.next(key, value)) {
-        LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
-        
+        LOG.info("Reading status for: " + key.toString() + " > "
+            + CrawlDatum.getStatusName(value.getStatus()));
+
         // Only consider fetch status
-        if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
+        if (CrawlDatum.hasFetchStatus(value)
+            && key.toString().equals("http://nutch.apache.org/")) {
           finalStatus = value.getStatus();
         }
       }
-      
+
       // Close the reader again
       reader.close();
     }
 
     // Remove the test directory again
     fs.delete(testDir, true);
-    
-    LOG.info("Final fetch status for: http://nutch.apache.org/ > " + CrawlDatum.getStatusName(finalStatus));
+
+    LOG.info("Final fetch status for: http://nutch.apache.org/ > "
+        + CrawlDatum.getStatusName(finalStatus));
 
     // Return the final status
     return finalStatus;
   }
-  
+
   /**
    * Merge some segments!
-   *
-   * @param the test directory
-   * @param the segments to merge
+   * 
+   * @param the
+   *          test directory
+   * @param the
+   *          segments to merge
    * @return Path to the merged segment
    */
   protected Path merge(Path testDir, Path[] segments) throws Exception {
     // Our merged output directory
     Path out = new Path(testDir, "out");
-    
+
     // Merge
     SegmentMerger merger = new SegmentMerger(conf);
     merger.merge(out, segments, false, false, -1);
 
     FileStatus[] stats = fs.listStatus(out);
     Assert.assertEquals(1, stats.length);
-    
+
     return stats[0].getPath();
   }
-  
+
   /**
    * Create a segment with the specified status.
-   *
-   * @param the segment's paths
-   * @param the status of the record, ignored if redirect is true
-   * @param whether we're doing a redirect as well
+   * 
+   * @param the
+   *          segment's paths
+   * @param the
+   *          status of the record, ignored if redirect is true
+   * @param whether
+   *          we're doing a redirect as well
    */
-  protected void createSegment(Path segment, byte status, boolean redirect) throws Exception {
+  protected void createSegment(Path segment, byte status, boolean redirect)
+      throws Exception {
     if (redirect) {
       createSegment(segment, status, false, true);
     } else {
@@ -332,28 +363,32 @@ public class TestSegmentMergerCrawlDatum
     }
   }
 
-  protected void createSegment(Path segment, byte status, boolean fetch, boolean redirect) throws Exception {
+  protected void createSegment(Path segment, byte status, boolean fetch,
+      boolean redirect) throws Exception {
     LOG.info("\nSegment: " + segment.toString());
-    
+
     // The URL of our main record
     String url = "http://nutch.apache.org/";
-    
+
     // The URL of our redirecting URL
     String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
-    
+
     // Our value
     CrawlDatum value = new CrawlDatum();
-    
+
     // Path of the segment's crawl_fetch directory
-    Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
+    Path crawlFetchPath = new Path(
+        new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
 
     // Get a writer for map files containing <Text,CrawlDatum> pairs
-    MapFile.Writer writer = new MapFile.Writer(conf, fs, crawlFetchPath.toString(), Text.class, CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, fs,
+        crawlFetchPath.toString(), Text.class, CrawlDatum.class);
 
     // Whether we're handling a redirect now
     // first add the linked datum
     // - before redirect status because url sorts before redirectUrl
-    // - before fetch status to check whether fetch datum is preferred over linked datum when merging
+    // - before fetch status to check whether fetch datum is preferred over
+    // linked datum when merging
     if (redirect) {
       // We're writing our our main record URL with status linked
       LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
@@ -365,7 +400,7 @@ public class TestSegmentMergerCrawlDatum
     // Whether we're fetching now
     if (fetch) {
       LOG.info(url + " > " + CrawlDatum.getStatusName(status));
-      
+
       // Set the status
       value.setStatus(status);
 
@@ -376,11 +411,12 @@ public class TestSegmentMergerCrawlDatum
     // Whether we're handing a redirect now
     if (redirect) {
       // And the redirect URL with redirect status, pointing to our main URL
-      LOG.info(redirectUrl + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
+      LOG.info(redirectUrl + " > "
+          + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
       value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
       writer.append(new Text(redirectUrl), value);
     }
-    
+
     // Close the stuff
     writer.close();
   }

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -30,16 +31,17 @@ public abstract class AbstractTestbedHan
 
   @Override
   public void handle(String target, HttpServletRequest req,
-          HttpServletResponse res, int dispatch) throws IOException,
-          ServletException {
-    Request base_request = (req instanceof Request) ? (Request)req : HttpConnection.getCurrentConnection().getRequest();
+      HttpServletResponse res, int dispatch) throws IOException,
+      ServletException {
+    Request base_request = (req instanceof Request) ? (Request) req
+        : HttpConnection.getCurrentConnection().getRequest();
     res.addHeader("X-TestbedHandlers", this.getClass().getSimpleName());
     handle(base_request, res, target, dispatch);
   }
-  
-  public abstract void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException;
-  
+
+  public abstract void handle(Request req, HttpServletResponse res,
+      String target, int dispatch) throws IOException, ServletException;
+
   public void addMyHeader(HttpServletResponse res, String name, String value) {
     name = "X-" + this.getClass().getSimpleName() + "-" + name;
     res.addHeader(name, value);

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -25,13 +26,13 @@ import javax.servlet.http.HttpServletRes
 import org.mortbay.jetty.Request;
 
 public class DelayHandler extends AbstractTestbedHandler {
-  
+
   public static final long DEFAULT_DELAY = 2000;
-  
+
   private int delay;
   private boolean random;
   private Random r;
-  
+
   public DelayHandler(int delay) {
     if (delay < 0) {
       delay = -delay;
@@ -43,13 +44,13 @@ public class DelayHandler extends Abstra
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
+      int dispatch) throws IOException, ServletException {
     try {
       int del = random ? r.nextInt(delay) : delay;
       Thread.sleep(del);
       addMyHeader(res, "Delay", String.valueOf(del));
     } catch (Exception e) {
-      
+
     }
   }
 }

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,22 +30,20 @@ import org.mortbay.jetty.Request;
 public class FakeHandler extends AbstractTestbedHandler {
   Random r = new Random(1234567890L); // predictable
 
-  private static final String testA = 
-    "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n" + 
-    "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>" +
-    "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>" +
-    "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
-  private static final String testB =
-    "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, similarly sized hail was falling all around us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>" +
-    "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>" +
-    "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
+  private static final String testA = "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n"
+      + "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>"
+      + "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>"
+      + "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
+  private static final String testB = "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, similarly sized hail was falling all around us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>"
+      + "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>"
+      + "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
 
   @Override
-  public void handle(Request req, HttpServletResponse res, String target, 
-          int dispatch) throws IOException, ServletException {
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
     HttpURI u = req.getUri();
     String uri = u.toString();
-    //System.err.println("-faking " + uri.toString());
+    // System.err.println("-faking " + uri.toString());
     addMyHeader(res, "URI", uri);
     // don't pass it down the chain
     req.setHandled(true);
@@ -68,8 +67,10 @@ public class FakeHandler extends Abstrac
         base = u.getPath();
       }
       String prefix = u.getScheme() + "://" + u.getHost();
-      if (u.getPort() != 80 && u.getPort() != -1) base += ":" + u.getPort();
-      if (!base.startsWith("/")) prefix += "/";
+      if (u.getPort() != 80 && u.getPort() != -1)
+        base += ":" + u.getPort();
+      if (!base.startsWith("/"))
+        prefix += "/";
       prefix = prefix + base;
       for (int i = 0; i < 10; i++) {
         String link = "<p><a href='" + prefix;
@@ -82,18 +83,20 @@ public class FakeHandler extends Abstrac
       // fake a few links to random nonexistent hosts
       for (int i = 0; i < 5; i++) {
         int h = r.nextInt(1000000); // 1 mln hosts
-        String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host " + h + "</a></p>\r\n";
+        String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host "
+            + h + "</a></p>\r\n";
         os.write(link.getBytes());
       }
       // fake a link to the root URL
       String link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
-      if (u.getPort() != 80 && u.getPort() != -1) link += ":" + u.getPort();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        link += ":" + u.getPort();
       link += "/'>site " + u.getHost() + "</a></p>\r\n";
       os.write(link.getBytes());
       os.write(testB.getBytes());
       res.flushBuffer();
     } catch (IOException ioe) {
-    }    
+    }
   }
 
 }

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -31,29 +32,33 @@ import org.slf4j.LoggerFactory;
 import org.mortbay.jetty.Request;
 
 public class LogDebugHandler extends AbstractTestbedHandler implements Filter {
-  private static final Logger LOG = LoggerFactory.getLogger(LogDebugHandler.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(LogDebugHandler.class);
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
-    LOG.info("-- " + req.getMethod() + " " + req.getUri().toString() + "\n" + req.getConnection().getRequestFields());
+      int dispatch) throws IOException, ServletException {
+    LOG.info("-- " + req.getMethod() + " " + req.getUri().toString() + "\n"
+        + req.getConnection().getRequestFields());
   }
 
   @Override
   public void doFilter(ServletRequest req, ServletResponse res,
-          FilterChain chain) throws IOException, ServletException {
-    ((HttpServletResponse)res).addHeader("X-Handled-By", "AsyncProxyHandler");
-    ((HttpServletResponse)res).addHeader("X-TestbedHandlers", "AsyncProxyHandler");
+      FilterChain chain) throws IOException, ServletException {
+    ((HttpServletResponse) res).addHeader("X-Handled-By", "AsyncProxyHandler");
+    ((HttpServletResponse) res).addHeader("X-TestbedHandlers",
+        "AsyncProxyHandler");
     try {
       chain.doFilter(req, res);
     } catch (Throwable e) {
-      ((HttpServletResponse)res).sendError(HttpServletResponse.SC_BAD_REQUEST, e.toString());
+      ((HttpServletResponse) res).sendError(HttpServletResponse.SC_BAD_REQUEST,
+          e.toString());
     }
   }
 
   @Override
   public void init(FilterConfig arg0) throws ServletException {
     // TODO Auto-generated method stub
-    
+
   }
 }

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -27,13 +28,13 @@ public class NotFoundHandler extends Abs
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
+      int dispatch) throws IOException, ServletException {
     // don't pass it down the chain
     req.setHandled(true);
     res.addHeader("X-Handled-By", getClass().getSimpleName());
     addMyHeader(res, "URI", req.getUri().toString());
-    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: " +
-            req.getUri().toString());
+    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: "
+        + req.getUri().toString());
   }
 
 }

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -44,17 +45,25 @@ public class ProxyTestbed {
    */
   public static void main(String[] args) throws Exception {
     if (args.length == 0) {
-      System.err.println("TestbedProxy [-seg <segment_name> | -segdir <segments>] [-port <nnn>] [-forward] [-fake] [-delay nnn] [-debug]");
-      System.err.println("-seg <segment_name>\tpath to a single segment (can be specified multiple times)");
-      System.err.println("-segdir <segments>\tpath to a parent directory of multiple segments (as above)");
-      System.err.println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
-      System.err.println("-forward\tif specified, requests to all unknown urls will be passed to");
-      System.err.println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
-      System.err.println("-delay\tdelay every response by nnn seconds. If delay is negative use a random value up to nnn");
-      System.err.println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
+      System.err
+          .println("TestbedProxy [-seg <segment_name> | -segdir <segments>] [-port <nnn>] [-forward] [-fake] [-delay nnn] [-debug]");
+      System.err
+          .println("-seg <segment_name>\tpath to a single segment (can be specified multiple times)");
+      System.err
+          .println("-segdir <segments>\tpath to a parent directory of multiple segments (as above)");
+      System.err
+          .println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
+      System.err
+          .println("-forward\tif specified, requests to all unknown urls will be passed to");
+      System.err
+          .println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
+      System.err
+          .println("-delay\tdelay every response by nnn seconds. If delay is negative use a random value up to nnn");
+      System.err
+          .println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
       System.exit(-1);
     }
-    
+
     Configuration conf = NutchConfiguration.create();
     int port = conf.getInt("segment.proxy.port", 8181);
     boolean forward = false;
@@ -62,7 +71,7 @@ public class ProxyTestbed {
     boolean delay = false;
     boolean debug = false;
     int delayVal = 0;
-    
+
     HashSet<Path> segs = new HashSet<Path>();
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-segdir")) {
@@ -88,28 +97,30 @@ public class ProxyTestbed {
         System.exit(-1);
       }
     }
-    
+
     // Create the server
     Server server = new Server();
     SocketConnector connector = new SocketConnector();
     connector.setPort(port);
     connector.setResolveNames(false);
     server.addConnector(connector);
-    
+
     // create a list of handlers
     HandlerList list = new HandlerList();
     server.addHandler(list);
-    
+
     if (debug) {
       LOG.info("* Added debug handler.");
       list.addHandler(new LogDebugHandler());
     }
- 
+
     if (delay) {
-      LOG.info("* Added delay handler: " + (delayVal < 0 ? "random delay up to " + (-delayVal) : "constant delay of " + delayVal));
+      LOG.info("* Added delay handler: "
+          + (delayVal < 0 ? "random delay up to " + (-delayVal)
+              : "constant delay of " + delayVal));
       list.addHandler(new DelayHandler(delayVal));
     }
-    
+
     // XXX alternatively, we can add the DispatchHandler as the first one,
     // XXX to activate handler plugins and redirect requests to appropriate
     // XXX handlers ... Here we always load these handlers
@@ -122,7 +133,8 @@ public class ProxyTestbed {
         list.addHandler(segment);
         LOG.info("* Added segment handler for: " + p);
       } catch (Exception e) {
-        LOG.warn("Skipping segment '" + p + "': " + StringUtils.stringifyException(e));
+        LOG.warn("Skipping segment '" + p + "': "
+            + StringUtils.stringifyException(e));
       }
     }
     if (forward) {

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java Thu Jan 29 05:38:59 2015
@@ -1,4 +1,5 @@
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -50,43 +51,55 @@ import org.mortbay.jetty.Request;
  * XXX should turn this into a plugin?
  */
 public class SegmentHandler extends AbstractTestbedHandler {
-  private static final Logger LOG = LoggerFactory.getLogger(SegmentHandler.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SegmentHandler.class);
   private Segment seg;
-  
-  private static HashMap<Integer,Integer> protoCodes = new HashMap<Integer,Integer>();
-  
+
+  private static HashMap<Integer, Integer> protoCodes = new HashMap<Integer, Integer>();
+
   static {
-    protoCodes.put(ProtocolStatus.ACCESS_DENIED, HttpServletResponse.SC_UNAUTHORIZED);
-    protoCodes.put(ProtocolStatus.BLOCKED, HttpServletResponse.SC_SERVICE_UNAVAILABLE);
-    protoCodes.put(ProtocolStatus.EXCEPTION, HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
+    protoCodes.put(ProtocolStatus.ACCESS_DENIED,
+        HttpServletResponse.SC_UNAUTHORIZED);
+    protoCodes.put(ProtocolStatus.BLOCKED,
+        HttpServletResponse.SC_SERVICE_UNAVAILABLE);
+    protoCodes.put(ProtocolStatus.EXCEPTION,
+        HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
     protoCodes.put(ProtocolStatus.FAILED, HttpServletResponse.SC_BAD_REQUEST);
     protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
-    protoCodes.put(ProtocolStatus.MOVED, HttpServletResponse.SC_MOVED_PERMANENTLY);
-    protoCodes.put(ProtocolStatus.NOTFETCHING, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.MOVED,
+        HttpServletResponse.SC_MOVED_PERMANENTLY);
+    protoCodes.put(ProtocolStatus.NOTFETCHING,
+        HttpServletResponse.SC_BAD_REQUEST);
     protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
-    protoCodes.put(ProtocolStatus.NOTMODIFIED, HttpServletResponse.SC_NOT_MODIFIED);
-    protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND, HttpServletResponse.SC_BAD_REQUEST);
-    protoCodes.put(ProtocolStatus.REDIR_EXCEEDED, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.NOTMODIFIED,
+        HttpServletResponse.SC_NOT_MODIFIED);
+    protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.REDIR_EXCEEDED,
+        HttpServletResponse.SC_BAD_REQUEST);
     protoCodes.put(ProtocolStatus.RETRY, HttpServletResponse.SC_BAD_REQUEST);
-    protoCodes.put(ProtocolStatus.ROBOTS_DENIED, HttpServletResponse.SC_FORBIDDEN);
+    protoCodes.put(ProtocolStatus.ROBOTS_DENIED,
+        HttpServletResponse.SC_FORBIDDEN);
     protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
-    protoCodes.put(ProtocolStatus.TEMP_MOVED, HttpServletResponse.SC_MOVED_TEMPORARILY);
-    protoCodes.put(ProtocolStatus.WOULDBLOCK, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.TEMP_MOVED,
+        HttpServletResponse.SC_MOVED_TEMPORARILY);
+    protoCodes.put(ProtocolStatus.WOULDBLOCK,
+        HttpServletResponse.SC_BAD_REQUEST);
   }
-  
+
   private static class SegmentPathFilter implements PathFilter {
     public static final SegmentPathFilter INSTANCE = new SegmentPathFilter();
-    
+
     @Override
     public boolean accept(Path p) {
       return p.getName().startsWith("part-");
     }
-    
+
   }
-  
+
   private static class Segment implements Closeable {
-    
-    private static final Partitioner<Text,Writable> PARTITIONER = new HashPartitioner<Text,Writable>();
+
+    private static final Partitioner<Text, Writable> PARTITIONER = new HashPartitioner<Text, Writable>();
 
     private Path segmentDir;
 
@@ -98,7 +111,8 @@ public class SegmentHandler extends Abst
     private MapFile.Reader[] crawl;
     private Configuration conf;
 
-    public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
+    public Segment(FileSystem fs, Path segmentDir, Configuration conf)
+        throws IOException {
       this.segmentDir = segmentDir;
       this.conf = conf;
     }
@@ -108,43 +122,52 @@ public class SegmentHandler extends Abst
         if (crawl == null)
           crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
       }
-      return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
+      return (CrawlDatum) getEntry(crawl, url, new CrawlDatum());
     }
-    
+
     public Content getContent(Text url) throws IOException {
       synchronized (cLock) {
         if (content == null)
           content = getReaders(Content.DIR_NAME);
       }
-      return (Content)getEntry(content, url, new Content());
+      return (Content) getEntry(content, url, new Content());
     }
 
     /** Open the output generated by this format. */
     private MapFile.Reader[] getReaders(String subDir) throws IOException {
       Path dir = new Path(segmentDir, subDir);
       FileSystem fs = dir.getFileSystem(conf);
-      Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, SegmentPathFilter.INSTANCE));
+      Path[] names = FileUtil.stat2Paths(fs.listStatus(dir,
+          SegmentPathFilter.INSTANCE));
 
       // sort names, so that hash partitioning works
       Arrays.sort(names);
-      
+
       MapFile.Reader[] parts = new MapFile.Reader[names.length];
       for (int i = 0; i < names.length; i++) {
         parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
       }
       return parts;
     }
-    
-    private Writable getEntry(MapFile.Reader[] readers, Text url,
-                              Writable entry) throws IOException {
+
+    private Writable getEntry(MapFile.Reader[] readers, Text url, Writable entry)
+        throws IOException {
       return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
     }
 
     public void close() throws IOException {
-      if (content != null) { closeReaders(content); }
-      if (parseText != null) { closeReaders(parseText); }
-      if (parseData != null) { closeReaders(parseData); }
-      if (crawl != null) { closeReaders(crawl); }
+      if (content != null) {
+        closeReaders(content);
+      }
+      if (parseText != null) {
+        closeReaders(parseText);
+      }
+      if (parseData != null) {
+        closeReaders(parseData);
+      }
+      if (crawl != null) {
+        closeReaders(crawl);
+      }
     }
 
     private void closeReaders(MapFile.Reader[] readers) throws IOException {
@@ -154,14 +177,14 @@ public class SegmentHandler extends Abst
     }
 
   }
-  
+
   public SegmentHandler(Configuration conf, Path name) throws Exception {
     seg = new Segment(FileSystem.get(conf), name, conf);
   }
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
+      int dispatch) throws IOException, ServletException {
     try {
       String uri = req.getUri().toString();
       LOG.info("URI: " + uri);
@@ -171,17 +194,18 @@ public class SegmentHandler extends Abst
       if (cd != null) {
         addMyHeader(res, "Res", "found");
         LOG.info("-got " + cd.toString());
-        ProtocolStatus ps = (ProtocolStatus)cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
+        ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(
+            Nutch.WRITABLE_PROTO_STATUS_KEY);
         if (ps != null) {
           Integer TrCode = protoCodes.get(ps.getCode());
           if (TrCode != null) {
-            res.setStatus(TrCode.intValue());            
+            res.setStatus(TrCode.intValue());
           } else {
             res.setStatus(HttpServletResponse.SC_OK);
           }
           addMyHeader(res, "ProtocolStatus", ps.toString());
         } else {
-          res.setStatus(HttpServletResponse.SC_OK);          
+          res.setStatus(HttpServletResponse.SC_OK);
         }
         Content c = seg.getContent(url);
         if (c == null) { // missing content

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Proxy to {@link org.apache.nutch.tools.Benchmark benchmark} the crawler.
  */
 package org.apache.nutch.tools.proxy;
+

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java Thu Jan 29 05:38:59 2015
@@ -26,221 +26,215 @@ import org.junit.Test;
 public class TestGZIPUtils {
 
   /* a short, highly compressable, string */
-  String SHORT_TEST_STRING= 
-      "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";
+  String SHORT_TEST_STRING = "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";
 
   /* a short, highly compressable, string */
-  String LONGER_TEST_STRING= 
-      SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
-      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
-      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
-      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING;
+  String LONGER_TEST_STRING = SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING;
 
   /* a snapshot of the nutch webpage */
-  String WEBPAGE= 
-      "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
-          + "<html>\n"
-          + "<head>\n"
-          + "  <meta http-equiv=\"content-type\"\n"
-          + " content=\"text/html; charset=ISO-8859-1\">\n"
-          + "  <title>Nutch</title>\n"
-          + "</head>\n"
-          + "<body>\n"
-          + "<h1\n"
-          + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: rgb(255, 153, 0);\"><a\n"
-          + " href=\"http://www.nutch.org/\"><font style=\"color: rgb(255, 153, 0);\">Nutch</font></a><br>\n"
-          + "<small>an open source web-search engine</small></h1>\n"
-          + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
-          + "<table\n"
-          + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: auto;\"\n"
-          + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
-          + "  <tbody>\n"
-          + "    <tr>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\">Download</a><br>\n"
-          + "      </td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"tutorial.html\">Tutorial</a><br>\n"
-          + "      </td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\">CVS</a><br>\n"
-          + "      </td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"api/index.html\">Javadoc</a><br>\n"
-          + "      </td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"http://sourceforge.net/tracker/?atid=491356&amp;group_id=59548&amp;func=browse\">Bugs</a><br>\n"
-          + "      </td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"http://sourceforge.net/mail/?group_id=59548\">Lists</a></td>\n"
-          + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
-          + " href=\"policies.html\">Policies</a><br>\n"
-          + "      </td>\n"
-          + "    </tr>\n"
-          + "  </tbody>\n"
-          + "</table>\n"
-          + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
-          + "<h2>Introduction</h2>\n"
-          + "Nutch is a nascent effort to implement an open-source web search\n"
-          + "engine. Web search is a basic requirement for internet navigation, yet\n"
-          + "the number of web search engines is decreasing. Today's oligopoly could\n"
-          + "soon be a monopoly, with a single company controlling nearly all web\n"
-          + "search for its commercial gain. &nbsp;That would not be good for the\n"
-          + "users of internet. &nbsp;Nutch aims to enable anyone to easily and\n"
-          + "cost-effectively deploy a world-class web search engine.<br>\n"
-          + "<br>\n"
-          + "To succeed, the Nutch software must be able to:<br>\n"
-          + "<ul>\n"
-          + "  <li> crawl several billion pages per month</li>\n"
-          + "  <li>maintain an index of these pages</li>\n"
-          + "  <li>search that index up to 1000 times per second</li>\n"
-          + "  <li>provide very high quality search results</li>\n"
-          + "  <li>operate at minimal cost</li>\n"
-          + "</ul>\n"
-          + "<h2>Status</h2>\n"
-          + "Currently we're just a handful of developers working part-time to put\n"
-          + "together a demo. &nbsp;The demo is coded entirely in Java. &nbsp;However\n"
-          + "persistent data is written in well-documented formats so that modules\n"
-          + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
-          + "project progresses.<br>\n"
-          + "<br>\n"
-          + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
-          + " href=\"http://sourceforge.net\"> </a>\n"
-          + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\"><img\n"
-          + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&amp;type=1\"\n"
-          + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
-          + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
-          + "</body>\n"
-          + "</html>\n";
+  String WEBPAGE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
+      + "<html>\n"
+      + "<head>\n"
+      + "  <meta http-equiv=\"content-type\"\n"
+      + " content=\"text/html; charset=ISO-8859-1\">\n"
+      + "  <title>Nutch</title>\n"
+      + "</head>\n"
+      + "<body>\n"
+      + "<h1\n"
+      + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: rgb(255, 153, 0);\"><a\n"
+      + " href=\"http://www.nutch.org/\"><font style=\"color: rgb(255, 153, 0);\">Nutch</font></a><br>\n"
+      + "<small>an open source web-search engine</small></h1>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<table\n"
+      + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: auto;\"\n"
+      + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
+      + "  <tbody>\n"
+      + "    <tr>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\">Download</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"tutorial.html\">Tutorial</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\">CVS</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"api/index.html\">Javadoc</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/tracker/?atid=491356&amp;group_id=59548&amp;func=browse\">Bugs</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/mail/?group_id=59548\">Lists</a></td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"policies.html\">Policies</a><br>\n"
+      + "      </td>\n"
+      + "    </tr>\n"
+      + "  </tbody>\n"
+      + "</table>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<h2>Introduction</h2>\n"
+      + "Nutch is a nascent effort to implement an open-source web search\n"
+      + "engine. Web search is a basic requirement for internet navigation, yet\n"
+      + "the number of web search engines is decreasing. Today's oligopoly could\n"
+      + "soon be a monopoly, with a single company controlling nearly all web\n"
+      + "search for its commercial gain. &nbsp;That would not be good for the\n"
+      + "users of internet. &nbsp;Nutch aims to enable anyone to easily and\n"
+      + "cost-effectively deploy a world-class web search engine.<br>\n"
+      + "<br>\n"
+      + "To succeed, the Nutch software must be able to:<br>\n"
+      + "<ul>\n"
+      + "  <li> crawl several billion pages per month</li>\n"
+      + "  <li>maintain an index of these pages</li>\n"
+      + "  <li>search that index up to 1000 times per second</li>\n"
+      + "  <li>provide very high quality search results</li>\n"
+      + "  <li>operate at minimal cost</li>\n"
+      + "</ul>\n"
+      + "<h2>Status</h2>\n"
+      + "Currently we're just a handful of developers working part-time to put\n"
+      + "together a demo. &nbsp;The demo is coded entirely in Java. &nbsp;However\n"
+      + "persistent data is written in well-documented formats so that modules\n"
+      + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
+      + "project progresses.<br>\n"
+      + "<br>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
+      + " href=\"http://sourceforge.net\"> </a>\n"
+      + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\"><img\n"
+      + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&amp;type=1\"\n"
+      + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
+      + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
+      + "</body>\n"
+      + "</html>\n";
 
   @Test
   public void testZipUnzip() {
-    byte[] testBytes= SHORT_TEST_STRING.getBytes();
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
     testZipUnzip(testBytes);
-    testBytes= LONGER_TEST_STRING.getBytes();
+    testBytes = LONGER_TEST_STRING.getBytes();
     testZipUnzip(testBytes);
-    testBytes= WEBPAGE.getBytes();
+    testBytes = WEBPAGE.getBytes();
     testZipUnzip(testBytes);
   }
 
   @Test
   public void testZipUnzipBestEffort() {
-    byte[] testBytes= SHORT_TEST_STRING.getBytes();
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
     testZipUnzipBestEffort(testBytes);
-    testBytes= LONGER_TEST_STRING.getBytes();
+    testBytes = LONGER_TEST_STRING.getBytes();
     testZipUnzipBestEffort(testBytes);
-    testBytes= WEBPAGE.getBytes();
+    testBytes = WEBPAGE.getBytes();
     testZipUnzipBestEffort(testBytes);
   }
 
   public void testTruncation() {
-    byte[] testBytes= SHORT_TEST_STRING.getBytes();
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
     testTruncation(testBytes);
-    testBytes= LONGER_TEST_STRING.getBytes();
+    testBytes = LONGER_TEST_STRING.getBytes();
     testTruncation(testBytes);
-    testBytes= WEBPAGE.getBytes();
+    testBytes = WEBPAGE.getBytes();
     testTruncation(testBytes);
   }
 
   @Test
   public void testLimit() {
-    byte[] testBytes= SHORT_TEST_STRING.getBytes();
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
     testLimit(testBytes);
-    testBytes= LONGER_TEST_STRING.getBytes();
+    testBytes = LONGER_TEST_STRING.getBytes();
     testLimit(testBytes);
-    testBytes= WEBPAGE.getBytes();
+    testBytes = WEBPAGE.getBytes();
     testLimit(testBytes);
   }
 
   // helpers
 
   public void testZipUnzip(byte[] origBytes) {
-    byte[] compressedBytes= GZIPUtils.zip(origBytes);
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
 
     Assert.assertTrue("compressed array is not smaller!",
         compressedBytes.length < origBytes.length);
 
-    byte[] uncompressedBytes= null;
+    byte[] uncompressedBytes = null;
     try {
-      uncompressedBytes= GZIPUtils.unzip(compressedBytes);
+      uncompressedBytes = GZIPUtils.unzip(compressedBytes);
     } catch (IOException e) {
       e.printStackTrace();
-      Assert.assertTrue("caught exception '" + e + "' during unzip()",
-          false);
+      Assert.assertTrue("caught exception '" + e + "' during unzip()", false);
     }
-    Assert.assertTrue("uncompressedBytes is wrong size", 
+    Assert.assertTrue("uncompressedBytes is wrong size",
         uncompressedBytes.length == origBytes.length);
 
-    for (int i= 0; i < origBytes.length; i++) 
+    for (int i = 0; i < origBytes.length; i++)
       if (origBytes[i] != uncompressedBytes[i])
         Assert.assertTrue("uncompressedBytes does not match origBytes", false);
   }
 
   public void testZipUnzipBestEffort(byte[] origBytes) {
-    byte[] compressedBytes= GZIPUtils.zip(origBytes);
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
 
     Assert.assertTrue("compressed array is not smaller!",
         compressedBytes.length < origBytes.length);
 
-    byte[] uncompressedBytes= GZIPUtils.unzipBestEffort(compressedBytes);
-    Assert.assertTrue("uncompressedBytes is wrong size", 
+    byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes);
+    Assert.assertTrue("uncompressedBytes is wrong size",
         uncompressedBytes.length == origBytes.length);
 
-    for (int i= 0; i < origBytes.length; i++) 
+    for (int i = 0; i < origBytes.length; i++)
       if (origBytes[i] != uncompressedBytes[i])
         Assert.assertTrue("uncompressedBytes does not match origBytes", false);
   }
 
   public void testTruncation(byte[] origBytes) {
-    byte[] compressedBytes= GZIPUtils.zip(origBytes);
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
 
     System.out.println("original data has len " + origBytes.length);
-    System.out.println("compressed data has len " 
-        + compressedBytes.length);
+    System.out.println("compressed data has len " + compressedBytes.length);
 
-    for (int i= compressedBytes.length; i >= 0; i--) {
+    for (int i = compressedBytes.length; i >= 0; i--) {
 
-      byte[] truncCompressed= new byte[i];
+      byte[] truncCompressed = new byte[i];
 
-      for (int j= 0; j < i; j++)
-        truncCompressed[j]= compressedBytes[j];
+      for (int j = 0; j < i; j++)
+        truncCompressed[j] = compressedBytes[j];
 
-      byte[] trunc= GZIPUtils.unzipBestEffort(truncCompressed);
+      byte[] trunc = GZIPUtils.unzipBestEffort(truncCompressed);
 
       if (trunc == null) {
-        System.out.println("truncated to len "
-            + i + ", trunc is null");
+        System.out.println("truncated to len " + i + ", trunc is null");
       } else {
-        System.out.println("truncated to len "
-            + i + ", trunc.length=  " 
+        System.out.println("truncated to len " + i + ", trunc.length=  "
             + trunc.length);
 
-        for (int j= 0; j < trunc.length; j++)
-          if (trunc[j] != origBytes[j]) 
+        for (int j = 0; j < trunc.length; j++)
+          if (trunc[j] != origBytes[j])
             Assert.assertTrue("truncated/uncompressed array differs at pos "
-                + j + " (compressed data had been truncated to len "
-                + i + ")", false);
+                + j + " (compressed data had been truncated to len " + i + ")",
+                false);
       }
     }
   }
 
   public void testLimit(byte[] origBytes) {
-    byte[] compressedBytes= GZIPUtils.zip(origBytes);
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
 
     Assert.assertTrue("compressed array is not smaller!",
         compressedBytes.length < origBytes.length);
 
-    for (int i= 0; i < origBytes.length; i++) {
+    for (int i = 0; i < origBytes.length; i++) {
 
-      byte[] uncompressedBytes= 
-          GZIPUtils.unzipBestEffort(compressedBytes, i);
+      byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes, i);
 
-      Assert.assertTrue("uncompressedBytes is wrong size", 
+      Assert.assertTrue("uncompressedBytes is wrong size",
           uncompressedBytes.length == i);
 
-      for (int j= 0; j < i; j++) 
+      for (int j = 0; j < i; j++)
         if (origBytes[j] != uncompressedBytes[j])
-          Assert.assertTrue("uncompressedBytes does not match origBytes", false);
+          Assert
+              .assertTrue("uncompressedBytes does not match origBytes", false);
     }
   }
 

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java Thu Jan 29 05:38:59 2015
@@ -36,7 +36,8 @@ public class TestMimeUtil extends TestCa
   private File sampleDir = new File(System.getProperty("test.build.data", "."),
       "test-mime-util");
 
-  /** test data, every element on "test page":
+  /**
+   * test data, every element on "test page":
    * <ol>
    * <li>MIME type</li>
    * <li>file name (last URL path element)</li>
@@ -67,15 +68,11 @@ public class TestMimeUtil extends TestCa
           "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
               + "<html>\n<head>\n"
               + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
-              + "</head>\n<body>Hello, World!</body></html>" }
-    };
+              + "</head>\n<body>Hello, World!</body></html>" } };
 
-  public static String[][] binaryFiles = {
-    {
+  public static String[][] binaryFiles = { {
       "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-      "test.xlsx",
-      "" }
-    };
+      "test.xlsx", "" } };
 
   private String getMimeType(String url, File file, String contentType,
       boolean useMagic) throws IOException {
@@ -121,8 +118,8 @@ public class TestMimeUtil extends TestCa
   public void testBinaryFiles() throws IOException {
     for (String[] testPage : binaryFiles) {
       File dataFile = new File(sampleDir, testPage[1]);
-      String mimeType = getMimeType(urlPrefix + testPage[1],
-          dataFile, testPage[2], false);
+      String mimeType = getMimeType(urlPrefix + testPage[1], dataFile,
+          testPage[2], false);
       assertEquals("", testPage[0], mimeType);
     }
   }

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java Thu Jan 29 05:38:59 2015
@@ -30,41 +30,40 @@ import org.xml.sax.InputSource;
 public class TestNodeWalker {
 
   /* a snapshot of the nutch webpage */
-  private final static String WEBPAGE= 
-  "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
-  + "<body>"
-  + "<ul>"
-  + "<li>crawl several billion pages per month</li>"
-  + "<li>maintain an index of these pages</li>"
-  + "<li>search that index up to 1000 times per second</li>"
-  + "<li>provide very high quality search results</li>"
-  + "<li>operate at minimal cost</li>"
-  + "</ul>"
-  + "</body>"
-  + "</html>";
+  private final static String WEBPAGE = "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
+      + "<body>"
+      + "<ul>"
+      + "<li>crawl several billion pages per month</li>"
+      + "<li>maintain an index of these pages</li>"
+      + "<li>search that index up to 1000 times per second</li>"
+      + "<li>provide very high quality search results</li>"
+      + "<li>operate at minimal cost</li>" + "</ul>" + "</body>" + "</html>";
 
   private final static String[] ULCONTENT = new String[4];
-  
+
   @Before
-  public void setUp() throws Exception{
-    ULCONTENT[0]="crawl several billion pages per month" ;
-    ULCONTENT[1]="maintain an index of these pages" ;
-    ULCONTENT[2]="search that index up to 1000 times per second"  ;
-    ULCONTENT[3]="operate at minimal cost" ;
+  public void setUp() throws Exception {
+    ULCONTENT[0] = "crawl several billion pages per month";
+    ULCONTENT[1] = "maintain an index of these pages";
+    ULCONTENT[2] = "search that index up to 1000 times per second";
+    ULCONTENT[3] = "operate at minimal cost";
   }
 
   @Test
   public void testSkipChildren() {
-    DOMParser parser= new DOMParser();
-    
+    DOMParser parser = new DOMParser();
+
     try {
       parser.setFeature("http://xml.org/sax/features/validation", false);
-      parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
-      parser.parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
+      parser.setFeature(
+          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+          false);
+      parser
+          .parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
     } catch (Exception e) {
       e.printStackTrace();
     }
-     
+
     StringBuffer sb = new StringBuffer();
     NodeWalker walker = new NodeWalker(parser.getDocument());
     while (walker.hasNext()) {
@@ -76,30 +75,33 @@ public class TestNodeWalker {
         sb.append(text);
       }
     }
-    Assert.assertTrue("UL Content can NOT be found in the node", findSomeUlContent(sb.toString()));
-     
-   StringBuffer sbSkip = new StringBuffer();
-   NodeWalker walkerSkip = new NodeWalker(parser.getDocument());
-   while (walkerSkip.hasNext()) {
-     Node currentNode = walkerSkip.nextNode();
-     String nodeName = currentNode.getNodeName();
-     short nodeType = currentNode.getNodeType();
-     if ("ul".equalsIgnoreCase(nodeName)) {
-       walkerSkip.skipChildren();
-     }
-     if (nodeType == Node.TEXT_NODE) {
-       String text = currentNode.getNodeValue();
-       text = text.replaceAll("\\s+", " ");
-       sbSkip.append(text);
-     }
-   }
-   Assert.assertFalse("UL Content can be found in the node", findSomeUlContent(sbSkip.toString()));
+    Assert.assertTrue("UL Content can NOT be found in the node",
+        findSomeUlContent(sb.toString()));
+
+    StringBuffer sbSkip = new StringBuffer();
+    NodeWalker walkerSkip = new NodeWalker(parser.getDocument());
+    while (walkerSkip.hasNext()) {
+      Node currentNode = walkerSkip.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      if ("ul".equalsIgnoreCase(nodeName)) {
+        walkerSkip.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        sbSkip.append(text);
+      }
+    }
+    Assert.assertFalse("UL Content can be found in the node",
+        findSomeUlContent(sbSkip.toString()));
   }
-  
+
   public boolean findSomeUlContent(String str) {
-    for(int i=0; i<ULCONTENT.length ; i++){
-      if(str.contains(ULCONTENT[i])) return true;
-    }    
+    for (int i = 0; i < ULCONTENT.length; i++) {
+      if (str.contains(ULCONTENT[i]))
+        return true;
+    }
     return false;
   }
 }

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java Thu Jan 29 05:38:59 2015
@@ -23,99 +23,93 @@ import org.junit.Test;
 /** Unit tests for PrefixStringMatcher. */
 public class TestPrefixStringMatcher {
 
-
-  private final static int NUM_TEST_ROUNDS= 20;
-  private final static int MAX_TEST_PREFIXES= 100;
-  private final static int MAX_PREFIX_LEN= 10;
-  private final static int NUM_TEST_INPUTS_PER_ROUND= 100;
-  private final static int MAX_INPUT_LEN= 20;
-
-  private final static char[] alphabet= 
-    new char[] {
-      'a', 'b', 'c', 'd',
-//      'e', 'f', 'g', 'h', 'i', 'j',
-//      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
-//      'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
-//      '5', '6', '7', '8', '9', '0'
-    };
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_PREFIXES = 100;
+  private final static int MAX_PREFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
 
   private String makeRandString(int minLen, int maxLen) {
-    int len= minLen + (int) (Math.random() * (maxLen - minLen));
-    char[] chars= new char[len];
-    
-    for (int pos= 0; pos < len; pos++) {
-      chars[pos]= alphabet[(int) (Math.random() * alphabet.length)];
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
     }
-    
+
     return new String(chars);
   }
-  
+
   @Test
   public void testPrefixMatcher() {
-    int numMatches= 0;
-    int numInputsTested= 0;
+    int numMatches = 0;
+    int numInputsTested = 0;
 
-    for (int round= 0; round < NUM_TEST_ROUNDS; round++) {
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
 
       // build list of prefixes
-      int numPrefixes= (int) (Math.random() * MAX_TEST_PREFIXES);
-      String[] prefixes= new String[numPrefixes];
-      for (int i= 0; i < numPrefixes; i++) {
-        prefixes[i]= makeRandString(0, MAX_PREFIX_LEN);
+      int numPrefixes = (int) (Math.random() * MAX_TEST_PREFIXES);
+      String[] prefixes = new String[numPrefixes];
+      for (int i = 0; i < numPrefixes; i++) {
+        prefixes[i] = makeRandString(0, MAX_PREFIX_LEN);
       }
 
-      PrefixStringMatcher prematcher= new PrefixStringMatcher(prefixes);
+      PrefixStringMatcher prematcher = new PrefixStringMatcher(prefixes);
 
       // test random strings for prefix matches
-      for (int i= 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
-        String input= makeRandString(0, MAX_INPUT_LEN);
-        boolean matches= false;
-        int longestMatch= -1;
-        int shortestMatch= -1;
-
-        for (int j= 0; j < prefixes.length; j++) {
-
-          if ((prefixes[j].length() > 0) 
-              && input.startsWith(prefixes[j])) {
-
-            matches= true;
-            int matchSize= prefixes[j].length();
-
-            if (matchSize > longestMatch) 
-              longestMatch= matchSize;
-
-            if ( (matchSize < shortestMatch)
-                 || (shortestMatch == -1) )
-              shortestMatch= matchSize;
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+        boolean matches = false;
+        int longestMatch = -1;
+        int shortestMatch = -1;
+
+        for (int j = 0; j < prefixes.length; j++) {
+
+          if ((prefixes[j].length() > 0) && input.startsWith(prefixes[j])) {
+
+            matches = true;
+            int matchSize = prefixes[j].length();
+
+            if (matchSize > longestMatch)
+              longestMatch = matchSize;
+
+            if ((matchSize < shortestMatch) || (shortestMatch == -1))
+              shortestMatch = matchSize;
           }
 
         }
 
-        if (matches) 
+        if (matches)
           numMatches++;
 
         numInputsTested++;
 
-        Assert.assertTrue( "'" + input + "' should " + (matches ? "" : "not ") 
-                    + "match!",
-                    matches == prematcher.matches(input) );
+        Assert.assertTrue("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches == prematcher.matches(input));
         if (matches) {
-          Assert.assertTrue( shortestMatch 
-                      == prematcher.shortestMatch(input).length());
-          Assert.assertTrue( input.substring(0, shortestMatch).equals(
-                        prematcher.shortestMatch(input)) );
-
-          Assert.assertTrue( longestMatch 
-                      == prematcher.longestMatch(input).length());
-          Assert.assertTrue( input.substring(0, longestMatch).equals(
-                        prematcher.longestMatch(input)) );
+          Assert.assertTrue(shortestMatch == prematcher.shortestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(0, shortestMatch).equals(
+              prematcher.shortestMatch(input)));
+
+          Assert.assertTrue(longestMatch == prematcher.longestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(0, longestMatch).equals(
+              prematcher.longestMatch(input)));
 
         }
       }
     }
 
-    System.out.println("got " + numMatches + " matches out of " 
-                       + numInputsTested + " tests");
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
   }
 
 }

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java Thu Jan 29 05:38:59 2015
@@ -24,37 +24,37 @@ import org.junit.Test;
 public class TestStringUtil {
 
   public void testRightPad() {
-    String s= "my string";
+    String s = "my string";
 
-    String ps= StringUtil.rightPad(s, 0);
+    String ps = StringUtil.rightPad(s, 0);
     Assert.assertTrue(s.equals(ps));
 
-    ps= StringUtil.rightPad(s, 9);
+    ps = StringUtil.rightPad(s, 9);
     Assert.assertTrue(s.equals(ps));
 
-    ps= StringUtil.rightPad(s, 10);
-    Assert.assertTrue( (s+" ").equals(ps) );
+    ps = StringUtil.rightPad(s, 10);
+    Assert.assertTrue((s + " ").equals(ps));
 
-    ps= StringUtil.rightPad(s, 15);
-    Assert.assertTrue( (s+"      ").equals(ps) );
+    ps = StringUtil.rightPad(s, 15);
+    Assert.assertTrue((s + "      ").equals(ps));
 
   }
 
   @Test
   public void testLeftPad() {
-    String s= "my string";
+    String s = "my string";
 
-    String ps= StringUtil.leftPad(s, 0);
+    String ps = StringUtil.leftPad(s, 0);
     Assert.assertTrue(s.equals(ps));
 
-    ps= StringUtil.leftPad(s, 9);
+    ps = StringUtil.leftPad(s, 9);
     Assert.assertTrue(s.equals(ps));
 
-    ps= StringUtil.leftPad(s, 10);
-    Assert.assertTrue( (" "+s).equals(ps) );
+    ps = StringUtil.leftPad(s, 10);
+    Assert.assertTrue((" " + s).equals(ps));
 
-    ps= StringUtil.leftPad(s, 15);
-    Assert.assertTrue( ("      "+s).equals(ps) );
+    ps = StringUtil.leftPad(s, 15);
+    Assert.assertTrue(("      " + s).equals(ps));
 
   }
 

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java Thu Jan 29 05:38:59 2015
@@ -23,97 +23,92 @@ import org.junit.Test;
 /** Unit tests for SuffixStringMatcher. */
 public class TestSuffixStringMatcher {
 
-  private final static int NUM_TEST_ROUNDS= 20;
-  private final static int MAX_TEST_SUFFIXES= 100;
-  private final static int MAX_SUFFIX_LEN= 10;
-  private final static int NUM_TEST_INPUTS_PER_ROUND= 100;
-  private final static int MAX_INPUT_LEN= 20;
-
-  private final static char[] alphabet= 
-    new char[] {
-      'a', 'b', 'c', 'd',
-//      'e', 'f', 'g', 'h', 'i', 'j',
-//      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
-//      'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
-//      '5', '6', '7', '8', '9', '0'
-    };
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_SUFFIXES = 100;
+  private final static int MAX_SUFFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
 
   private String makeRandString(int minLen, int maxLen) {
-    int len= minLen + (int) (Math.random() * (maxLen - minLen));
-    char[] chars= new char[len];
-    
-    for (int pos= 0; pos < len; pos++) {
-      chars[pos]= alphabet[(int) (Math.random() * alphabet.length)];
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
     }
-    
+
     return new String(chars);
   }
-  
+
   @Test
   public void testSuffixMatcher() {
-    int numMatches= 0;
-    int numInputsTested= 0;
+    int numMatches = 0;
+    int numInputsTested = 0;
 
-    for (int round= 0; round < NUM_TEST_ROUNDS; round++) {
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
 
       // build list of suffixes
-      int numSuffixes= (int) (Math.random() * MAX_TEST_SUFFIXES);
-      String[] suffixes= new String[numSuffixes];
-      for (int i= 0; i < numSuffixes; i++) {
-        suffixes[i]= makeRandString(0, MAX_SUFFIX_LEN);
+      int numSuffixes = (int) (Math.random() * MAX_TEST_SUFFIXES);
+      String[] suffixes = new String[numSuffixes];
+      for (int i = 0; i < numSuffixes; i++) {
+        suffixes[i] = makeRandString(0, MAX_SUFFIX_LEN);
       }
 
-      SuffixStringMatcher sufmatcher= new SuffixStringMatcher(suffixes);
+      SuffixStringMatcher sufmatcher = new SuffixStringMatcher(suffixes);
 
       // test random strings for suffix matches
-      for (int i= 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
-        String input= makeRandString(0, MAX_INPUT_LEN);
-        boolean matches= false;
-        int longestMatch= -1;
-        int shortestMatch= -1;
-
-        for (int j= 0; j < suffixes.length; j++) {
-
-          if ((suffixes[j].length() > 0) 
-              && input.endsWith(suffixes[j])) {
-
-            matches= true;
-            int matchSize= suffixes[j].length();
-
-            if (matchSize > longestMatch) 
-              longestMatch= matchSize;
-
-            if ( (matchSize < shortestMatch)
-                 || (shortestMatch == -1) )
-              shortestMatch= matchSize;
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+        boolean matches = false;
+        int longestMatch = -1;
+        int shortestMatch = -1;
+
+        for (int j = 0; j < suffixes.length; j++) {
+
+          if ((suffixes[j].length() > 0) && input.endsWith(suffixes[j])) {
+
+            matches = true;
+            int matchSize = suffixes[j].length();
+
+            if (matchSize > longestMatch)
+              longestMatch = matchSize;
+
+            if ((matchSize < shortestMatch) || (shortestMatch == -1))
+              shortestMatch = matchSize;
           }
 
         }
 
-        if (matches) 
+        if (matches)
           numMatches++;
 
         numInputsTested++;
 
-        Assert.assertTrue( "'" + input + "' should " + (matches ? "" : "not ") 
-                    + "match!",
-                    matches == sufmatcher.matches(input) );
+        Assert.assertTrue("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches == sufmatcher.matches(input));
         if (matches) {
-          Assert.assertTrue( shortestMatch 
-                      == sufmatcher.shortestMatch(input).length());
-          Assert.assertTrue( input.substring(input.length() - shortestMatch).equals(
-                        sufmatcher.shortestMatch(input)) );
-
-          Assert.assertTrue( longestMatch 
-                      == sufmatcher.longestMatch(input).length());
-          Assert.assertTrue( input.substring(input.length() - longestMatch).equals(
-                        sufmatcher.longestMatch(input)) );
+          Assert.assertTrue(shortestMatch == sufmatcher.shortestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(input.length() - shortestMatch)
+              .equals(sufmatcher.shortestMatch(input)));
+
+          Assert.assertTrue(longestMatch == sufmatcher.longestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(input.length() - longestMatch)
+              .equals(sufmatcher.longestMatch(input)));
         }
       }
     }
 
-    System.out.println("got " + numMatches + " matches out of " 
-                       + numInputsTested + " tests");
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
   }
 
 }