You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC

svn commit: r1650447 [10/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ s...

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/WebTableCreator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/WebTableCreator.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/WebTableCreator.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/WebTableCreator.java Fri Jan  9 06:34:33 2015
@@ -21,9 +21,8 @@ import org.apache.gora.store.DataStore;
 
 public class WebTableCreator {
   public static void main(String[] args) throws Exception {
-    DataStore<String, WebPage> store =
-      StorageUtils.createWebStore(NutchConfiguration.create(), String.class,
-        WebPage.class);
+    DataStore<String, WebPage> store = StorageUtils.createWebStore(
+        NutchConfiguration.create(), String.class, WebPage.class);
 
     System.out.println(store);
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/package-info.java Fri Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * {@link org.apache.nutch.storage.Host host metadata}) of data in abstracted storage.
  */
 package org.apache.nutch.storage;
+

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Fri Jan  9 06:34:33 2015
@@ -50,7 +50,8 @@ public class Benchmark extends Configure
     System.exit(res);
   }
 
-  private void createSeeds(FileSystem fs, Path seedsDir, int count) throws Exception {
+  private void createSeeds(FileSystem fs, Path seedsDir, int count)
+      throws Exception {
     OutputStream os = fs.create(new Path(seedsDir, "seeds"));
     for (int i = 0; i < count; i++) {
       String url = "http://www.test-" + i + ".com/\r\n";
@@ -61,7 +62,7 @@ public class Benchmark extends Configure
   }
 
   public static final class BenchmarkResults {
-    Map<String,Map<String,Long>> timings = new HashMap<String,Map<String,Long>>();
+    Map<String, Map<String, Long>> timings = new HashMap<String, Map<String, Long>>();
     List<String> runs = new ArrayList<String>();
     List<String> stages = new ArrayList<String>();
     int seeds, depth, threads;
@@ -76,9 +77,9 @@ public class Benchmark extends Configure
       if (!stages.contains(stage)) {
         stages.add(stage);
       }
-      Map<String,Long> t = timings.get(stage);
+      Map<String, Long> t = timings.get(stage);
       if (t == null) {
-        t = new HashMap<String,Long>();
+        t = new HashMap<String, Long>();
         timings.put(stage, t);
       }
       t.put(run, timing);
@@ -94,8 +95,9 @@ public class Benchmark extends Configure
       sb.append("* TopN:\t" + topN + "\n");
       sb.append("* TOTAL ELAPSED:\t" + elapsed + "\n");
       for (String stage : stages) {
-        Map<String,Long> timing = timings.get(stage);
-        if (timing == null) continue;
+        Map<String, Long> timing = timings.get(stage);
+        if (timing == null)
+          continue;
         sb.append("- stage: " + stage + "\n");
         for (String r : runs) {
           Long Time = timing.get(r);
@@ -111,6 +113,7 @@ public class Benchmark extends Configure
     public List<String> getStages() {
       return stages;
     }
+
     public List<String> getRuns() {
       return runs;
     }
@@ -121,21 +124,28 @@ public class Benchmark extends Configure
     int seeds = 1;
     int depth = 10;
     int threads = 10;
-    //boolean delete = true;
+    // boolean delete = true;
     long topN = Long.MAX_VALUE;
 
     if (args.length == 0) {
-      System.err.println("Usage: Benchmark [-crawlId <id>] [-seeds NN] [-depth NN] [-threads NN] [-maxPerHost NN] [-plugins <regex>]");
-      System.err.println("\t-crawlId id\t the id to prefix the schemas to operate on, (default: storage.crawl.id)");
-      System.err.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
+      System.err
+          .println("Usage: Benchmark [-crawlId <id>] [-seeds NN] [-depth NN] [-threads NN] [-maxPerHost NN] [-plugins <regex>]");
+      System.err
+          .println("\t-crawlId id\t the id to prefix the schemas to operate on, (default: storage.crawl.id)");
+      System.err
+          .println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
       System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
-      System.err.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
+      System.err
+          .println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
       // XXX what is the equivalent here? not an additional job...
       // System.err.println("\t-keep\tkeep batchId data (default: delete after updatedb)");
       System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
-      System.err.println("\tNOTE: if not specified, this is reset to: " + plugins);
-      System.err.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
-      System.err.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
+      System.err.println("\tNOTE: if not specified, this is reset to: "
+          + plugins);
+      System.err
+          .println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
+      System.err
+          .println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
       return -1;
     }
     int maxPerHost = Integer.MAX_VALUE;
@@ -157,13 +167,14 @@ public class Benchmark extends Configure
         return -1;
       }
     }
-    BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN, plugins);
+    BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN,
+        plugins);
     System.out.println(res);
     return 0;
   }
 
-  public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost,
-        long topN, String plugins) throws Exception {
+  public BenchmarkResults benchmark(int seeds, int depth, int threads,
+      int maxPerHost, long topN, String plugins) throws Exception {
     Configuration conf = getConf();
     conf.set("http.proxy.host", "localhost");
     conf.setInt("http.proxy.port", 8181);
@@ -173,11 +184,12 @@ public class Benchmark extends Configure
       conf.set("plugin.includes", plugins);
     }
     conf.setInt(GeneratorJob.GENERATOR_MAX_COUNT, maxPerHost);
-    conf.set(GeneratorJob.GENERATOR_COUNT_MODE, GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
+    conf.set(GeneratorJob.GENERATOR_COUNT_MODE,
+        GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
     Job job = new NutchJob(conf);
     FileSystem fs = FileSystem.get(job.getConfiguration());
-    Path dir = new Path(getConf().get("hadoop.tmp.dir"),
-            "bench-" + System.currentTimeMillis());
+    Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
+        + System.currentTimeMillis());
     fs.mkdirs(dir);
     Path rootUrlDir = new Path(dir, "seed");
     fs.mkdirs(rootUrlDir);
@@ -204,7 +216,7 @@ public class Benchmark extends Configure
     ParserJob parseBatch = new ParserJob(conf);
     DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
     // not needed in the new API
-    //LinkDb linkDbTool = new LinkDb(getConf());
+    // LinkDb linkDbTool = new LinkDb(getConf());
 
     long start = System.currentTimeMillis();
     // initialize crawlDb
@@ -212,10 +224,10 @@ public class Benchmark extends Configure
     long delta = System.currentTimeMillis() - start;
     res.addTiming("inject", "0", delta);
     int i;
-    for (i = 0; i < depth; i++) {             // generate new batch
+    for (i = 0; i < depth; i++) { // generate new batch
       start = System.currentTimeMillis();
       String batchId = generator.generate(topN, System.currentTimeMillis(),
-              false, false);
+          false, false);
       delta = System.currentTimeMillis() - start;
       res.addTiming("generate", i + "", delta);
       if (batchId == null) {
@@ -224,12 +236,12 @@ public class Benchmark extends Configure
       }
       boolean isParsing = getConf().getBoolean("fetcher.parse", false);
       start = System.currentTimeMillis();
-      fetcher.fetch(batchId, threads, false, -1);  // fetch it
+      fetcher.fetch(batchId, threads, false, -1); // fetch it
       delta = System.currentTimeMillis() - start;
       res.addTiming("fetch", i + "", delta);
       if (!isParsing) {
         start = System.currentTimeMillis();
-        parseBatch.parse(batchId, false, false);    // parse it, if needed
+        parseBatch.parse(batchId, false, false); // parse it, if needed
         delta = System.currentTimeMillis() - start;
         res.addTiming("parse", i + "", delta);
       }
@@ -241,7 +253,9 @@ public class Benchmark extends Configure
     if (i == 0) {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }
-    if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
+    if (LOG.isInfoEnabled()) {
+      LOG.info("crawl finished: " + dir);
+    }
     res.elapsed = System.currentTimeMillis() - res.elapsed;
     WebTableReader dbreader = new WebTableReader();
     dbreader.setConf(conf);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Fri Jan  9 06:34:33 2015
@@ -40,17 +40,16 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TableUtil;
 
-
 /** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
 public class DmozParser {
   public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class);
-  
-    long pages = 0;
-    private static DataStore<String, WebPage> store = null;
-    
+
+  long pages = 0;
+  private static DataStore<String, WebPage> store = null;
+
   /**
-   * This filter fixes characters that might offend our parser.
-   * This lets us be tolerant of errors that might appear in the input XML.
+   * This filter fixes characters that might offend our parser. This lets us be
+   * tolerant of errors that might appear in the input XML.
    */
   private static class XMLCharFilter extends FilterReader {
     private boolean lastBad = false;
@@ -62,9 +61,9 @@ public class DmozParser {
     public int read() throws IOException {
       int c = in.read();
       int value = c;
-      if (c != -1 && !(XMLChar.isValid(c)))     // fix invalid characters
+      if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
         value = 'X';
-      else if (lastBad && c == '<') {           // fix mis-matched brackets
+      else if (lastBad && c == '<') { // fix mis-matched brackets
         in.mark(1);
         if (in.read() != '/')
           value = 'X';
@@ -75,37 +74,35 @@ public class DmozParser {
       return value;
     }
 
-    public int read(char[] cbuf, int off, int len)
-      throws IOException {
+    public int read(char[] cbuf, int off, int len) throws IOException {
       int n = in.read(cbuf, off, len);
       if (n != -1) {
         for (int i = 0; i < n; i++) {
-          char c = cbuf[off+i];
+          char c = cbuf[off + i];
           char value = c;
-          if (!(XMLChar.isValid(c)))            // fix invalid characters
+          if (!(XMLChar.isValid(c))) // fix invalid characters
             value = 'X';
-          else if (lastBad && c == '<') {       // fix mis-matched brackets
-            if (i != n-1 && cbuf[off+i+1] != '/')
+          else if (lastBad && c == '<') { // fix mis-matched brackets
+            if (i != n - 1 && cbuf[off + i + 1] != '/')
               value = 'X';
           }
           lastBad = (c == 65533);
-          cbuf[off+i] = value;
+          cbuf[off + i] = value;
         }
       }
       return n;
     }
   }
 
-
   /**
-   * The RDFProcessor receives tag messages during a parse
-   * of RDF XML data.  We build whatever structures we need
-   * from these messages.
+   * The RDFProcessor receives tag messages during a parse of RDF XML data. We
+   * build whatever structures we need from these messages.
    */
   private class RDFProcessor extends DefaultHandler {
     String curURL = null, curSection = null;
-    boolean titlePending = false, descPending = false, insideAdultSection = false;
-    Pattern topicPattern = null; 
+    boolean titlePending = false, descPending = false,
+        insideAdultSection = false;
+    Pattern topicPattern = null;
     StringBuffer title = new StringBuffer(), desc = new StringBuffer();
     XMLReader reader;
     int subsetDenom;
@@ -115,16 +112,18 @@ public class DmozParser {
     Locator location;
 
     /**
-     * Pass in an XMLReader, plus a flag as to whether we 
-     * should include adult material.
+     * Pass in an XMLReader, plus a flag as to whether we should include adult
+     * material.
      */
-    public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern, boolean snippet) throws IOException {
+    public RDFProcessor(XMLReader reader, int subsetDenom,
+        boolean includeAdult, int skew, Pattern topicPattern, boolean snippet)
+        throws IOException {
       this.reader = reader;
       this.subsetDenom = subsetDenom;
       this.includeAdult = includeAdult;
       this.topicPattern = topicPattern;
       this.snippet = snippet;
-      
+
       this.hashSkew = skew != 0 ? skew : new Random().nextInt();
     }
 
@@ -135,20 +134,21 @@ public class DmozParser {
     /**
      * Start of an XML elt
      */
-    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+    public void startElement(String namespaceURI, String localName,
+        String qName, Attributes atts) throws SAXException {
       if ("Topic".equals(qName)) {
         curSection = atts.getValue("r:id");
       } else if ("ExternalPage".equals(qName)) {
         // Porn filter
-        if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
+        if ((!includeAdult) && curSection.startsWith("Top/Adult")) {
           return;
         }
-          
+
         if (topicPattern != null && !topicPattern.matcher(curSection).matches()) {
           return;
         }
 
-        // Subset denominator filter.  
+        // Subset denominator filter.
         // Only emit with a chance of 1/denominator.
         String url = atts.getValue("about");
         int hashValue = MD5Hash.digest(url).hashCode();
@@ -181,40 +181,42 @@ public class DmozParser {
      * Termination of XML elt
      */
     public void endElement(String namespaceURI, String localName, String qName)
-      throws SAXException {
+        throws SAXException {
       if (curURL != null) {
         if ("ExternalPage".equals(qName)) {
           //
-          // Inc the number of pages, insert the page, and 
+          // Inc the number of pages, insert the page, and
           // possibly print status.
           //
-          if(snippet){
+          if (snippet) {
             try {
               String reversedUrl = TableUtil.reverseUrl(curURL);
               WebPage row = store.get(reversedUrl);
-              
-              if(row!=null){
+
+              if (row != null) {
                 if (desc.length() > 0) {
-                  row.getMetadata().put(new Utf8("_dmoz_desc_"), ByteBuffer.wrap(desc.toString().getBytes()));
+                  row.getMetadata().put(new Utf8("_dmoz_desc_"),
+                      ByteBuffer.wrap(desc.toString().getBytes()));
                   desc.delete(0, desc.length());
                 }
                 if (title.length() > 0) {
-                  row.getMetadata().put(new Utf8("_dmoz_title_"), ByteBuffer.wrap(title.toString().getBytes()));
+                  row.getMetadata().put(new Utf8("_dmoz_title_"),
+                      ByteBuffer.wrap(title.toString().getBytes()));
                   title.delete(0, title.length());
                 }
                 store.put(reversedUrl, row);
                 store.flush();
               }
-              
-             } catch (IOException e) {
+
+            } catch (IOException e) {
               // TODO Auto-generated catch block
               e.printStackTrace();
-             }
+            }
           } else {
-            System.out.println(curURL); 
-            
+            System.out.println(curURL);
+
             //
-            // Clear out the link text.  This is what
+            // Clear out the link text. This is what
             // you would use for adding to the linkdb.
             //
             if (desc.length() > 0) {
@@ -225,7 +227,7 @@ public class DmozParser {
             }
           }
           pages++;
-          
+
           // Null out the URL.
           curURL = null;
         } else if ("d:Title".equals(qName)) {
@@ -252,15 +254,13 @@ public class DmozParser {
     }
 
     /**
-     * From time to time the Parser will set the "current location"
-     * by calling this function.  It's useful for emitting locations
-     * for error messages.
+     * From time to time the Parser will set the "current location" by calling
+     * this function. It's useful for emitting locations for error messages.
      */
     public void setDocumentLocator(Locator locator) {
       location = locator;
     }
 
-
     //
     // Interface ErrorHandler
     //
@@ -280,11 +280,11 @@ public class DmozParser {
     public void fatalError(SAXParseException spe) {
       if (LOG.isErrorEnabled()) {
         LOG.error("Fatal err: " + spe.toString() + ": " + spe.getMessage());
-        LOG.error("Last known line is " + location.getLineNumber() +
-                  ", column " + location.getColumnNumber());
+        LOG.error("Last known line is " + location.getLineNumber()
+            + ", column " + location.getColumnNumber());
       }
     }
-        
+
     /**
      * Emit exception warning message
      */
@@ -296,35 +296,33 @@ public class DmozParser {
   }
 
   /**
-   * Iterate through all the items in this structured DMOZ file.
-   * Add each URL to the web db.
+   * Iterate through all the items in this structured DMOZ file. Add each URL to
+   * the web db.
    */
   public void parseDmozFile(File dmozFile, int subsetDenom,
-                            boolean includeAdult,
-                            int skew,
-                            Pattern topicPattern,
-                            boolean snippet)
+      boolean includeAdult, int skew, Pattern topicPattern, boolean snippet)
 
-    throws IOException, SAXException, ParserConfigurationException {
+  throws IOException, SAXException, ParserConfigurationException {
 
     SAXParserFactory parserFactory = SAXParserFactory.newInstance();
     SAXParser parser = parserFactory.newSAXParser();
     XMLReader reader = parser.getXMLReader();
 
     // Create our own processor to receive SAX events
-    RDFProcessor rp =
-      new RDFProcessor(reader, subsetDenom, includeAdult,
-                       skew, topicPattern, snippet);
+    RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
+        topicPattern, snippet);
     reader.setContentHandler(rp);
     reader.setErrorHandler(rp);
     LOG.info("skew = " + rp.hashSkew);
 
     //
-    // Open filtered text stream.  The TextFilter makes sure that
+    // Open filtered text stream. The TextFilter makes sure that
     // only appropriate XML-approved Text characters are received.
     // Any non-conforming characters are silently skipped.
     //
-    XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
+    XMLCharFilter in = new XMLCharFilter(new BufferedReader(
+        new InputStreamReader(new BufferedInputStream(new FileInputStream(
+            dmozFile)), "UTF-8")));
     try {
       InputSource is = new InputSource(in);
       reader.parse(is);
@@ -338,18 +336,17 @@ public class DmozParser {
     }
   }
 
-  private static void addTopicsFromFile(String topicFile,
-                                        Vector<String> topics)
-  throws IOException {
+  private static void addTopicsFromFile(String topicFile, Vector<String> topics)
+      throws IOException {
     BufferedReader in = null;
     try {
-      in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
+      in = new BufferedReader(new InputStreamReader(new FileInputStream(
+          topicFile), "UTF-8"));
       String line = null;
       while ((line = in.readLine()) != null) {
         topics.addElement(new String(line));
       }
-    } 
-    catch (Exception e) {
+    } catch (Exception e) {
       if (LOG.isErrorEnabled()) {
         LOG.error("Failed with the following exception: ", e.toString());
       }
@@ -358,18 +355,19 @@ public class DmozParser {
       in.close();
     }
   }
-    
+
   /**
-   * Command-line access.  User may add URLs via a flat text file
-   * or the structured DMOZ file.  By default, we ignore Adult
-   * material (as categorized by DMOZ).
+   * Command-line access. User may add URLs via a flat text file or the
+   * structured DMOZ file. By default, we ignore Adult material (as categorized
+   * by DMOZ).
    */
   public static void main(String argv[]) throws Exception {
     if (argv.length < 1) {
-      System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-snippet] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
+      System.err
+          .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-snippet] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
       return;
     }
-    
+
     //
     // Parse the command line, figure out what kind of
     // URL file we need to load
@@ -379,29 +377,29 @@ public class DmozParser {
     String dmozFile = argv[0];
     boolean includeAdult = false;
     boolean snippet = false;
-    Pattern topicPattern = null; 
+    Pattern topicPattern = null;
     Vector<String> topics = new Vector<String>();
-    
+
     Configuration conf = NutchConfiguration.create();
-    store = StorageUtils.createWebStore(conf,String.class, WebPage.class);
+    store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
     FileSystem fs = FileSystem.get(conf);
     try {
       for (int i = 1; i < argv.length; i++) {
         if ("-includeAdultMaterial".equals(argv[i])) {
           includeAdult = true;
         } else if ("-subset".equals(argv[i])) {
-          subsetDenom = Integer.parseInt(argv[i+1]);
+          subsetDenom = Integer.parseInt(argv[i + 1]);
           i++;
         } else if ("-topic".equals(argv[i])) {
-          topics.addElement(argv[i+1]); 
+          topics.addElement(argv[i + 1]);
           i++;
         } else if ("-topicFile".equals(argv[i])) {
-          addTopicsFromFile(argv[i+1], topics);
+          addTopicsFromFile(argv[i + 1], topics);
           i++;
         } else if ("-skew".equals(argv[i])) {
-          skew = Integer.parseInt(argv[i+1]);
+          skew = Integer.parseInt(argv[i + 1]);
           i++;
-        }else if ("-snippet".equals(argv[i])) {
+        } else if ("-snippet".equals(argv[i])) {
           snippet = true;
         }
       }
@@ -409,21 +407,21 @@ public class DmozParser {
       DmozParser parser = new DmozParser();
 
       if (!topics.isEmpty()) {
-        String regExp = new String("^("); 
+        String regExp = new String("^(");
         int j = 0;
-        for ( ; j < topics.size() - 1; ++j) {
+        for (; j < topics.size() - 1; ++j) {
           regExp = regExp.concat(topics.get(j));
           regExp = regExp.concat("|");
         }
         regExp = regExp.concat(topics.get(j));
-        regExp = regExp.concat(").*"); 
+        regExp = regExp.concat(").*");
         LOG.info("Topic selection pattern = " + regExp);
-        topicPattern = Pattern.compile(regExp); 
+        topicPattern = Pattern.compile(regExp);
       }
 
-      parser.parseDmozFile(new File(dmozFile), subsetDenom,
-                           includeAdult, skew, topicPattern, snippet);
-      
+      parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
+          topicPattern, snippet);
+
     } finally {
       fs.close();
     }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/ResolveUrls.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/ResolveUrls.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/ResolveUrls.java Fri Jan  9 06:34:33 2015
@@ -59,8 +59,7 @@ public class ResolveUrls {
   /**
    * A Thread which gets the ip address of a single host by name.
    */
-  private static class ResolverThread
-    extends Thread {
+  private static class ResolverThread extends Thread {
 
     private String url = null;
 
@@ -74,13 +73,12 @@ public class ResolveUrls {
       String host = URLUtil.getHost(url);
       long start = System.currentTimeMillis();
       try {
-        // get the address by name and if no error is thrown then it 
+        // get the address by name and if no error is thrown then it
         // is resolved successfully
         InetAddress.getByName(host);
         LOG.info("Resolved: " + host);
         numResolved.incrementAndGet();
-      }
-      catch (Exception uhe) {
+      } catch (Exception uhe) {
         LOG.info("Error Resolving: " + host);
         numErrored.incrementAndGet();
       }
@@ -92,8 +90,8 @@ public class ResolveUrls {
   }
 
   /**
-   * Creates a thread pool for resolving urls.  Reads in the url file on the
-   * local filesystem.  For each url it attempts to resolve it keeping a total
+   * Creates a thread pool for resolving urls. Reads in the url file on the
+   * local filesystem. For each url it attempts to resolve it keeping a total
    * account of the number resolved, errored, and the amount of time.
    */
   public void resolveUrls() {
@@ -102,13 +100,13 @@ public class ResolveUrls {
 
       // create a thread pool with a fixed number of threads
       pool = Executors.newFixedThreadPool(numThreads);
-      
+
       // read in the urls file and loop through each line, one url per line
       BufferedReader buffRead = new BufferedReader(new FileReader(new File(
-        urlsFile)));
+          urlsFile)));
       String urlStr = null;
       while ((urlStr = buffRead.readLine()) != null) {
-        
+
         // spin up a resolver thread per url
         LOG.info("Starting: " + urlStr);
         pool.execute(new ResolverThread(urlStr));
@@ -118,9 +116,8 @@ public class ResolveUrls {
       // the thread pool to give urls time to finish resolving
       buffRead.close();
       pool.awaitTermination(60, TimeUnit.SECONDS);
-    }
-    catch (Exception e) {
-      
+    } catch (Exception e) {
+
       // on error shutdown the thread pool immediately
       pool.shutdownNow();
       LOG.info(StringUtils.stringifyException(e));
@@ -128,15 +125,16 @@ public class ResolveUrls {
 
     // shutdown the thread pool and log totals
     pool.shutdown();
-    LOG.info("Total: " + numTotal.get() + ", Resovled: "
-      + numResolved.get() + ", Errored: " + numErrored.get()
-      + ", Average Time: " + totalTime.get() / numTotal.get());
+    LOG.info("Total: " + numTotal.get() + ", Resovled: " + numResolved.get()
+        + ", Errored: " + numErrored.get() + ", Average Time: "
+        + totalTime.get() / numTotal.get());
   }
 
   /**
    * Create a new ResolveUrls with a file from the local file system.
-   *
-   * @param urlsFile The local urls file, one url per line.
+   * 
+   * @param urlsFile
+   *          The local urls file, one url per line.
    */
   public ResolveUrls(String urlsFile) {
     this(urlsFile, 100);
@@ -144,10 +142,12 @@ public class ResolveUrls {
 
   /**
    * Create a new ResolveUrls with a urls file and a number of threads for the
-   * Thread pool.  Number of threads is 100 by default.
+   * Thread pool. Number of threads is 100 by default.
    * 
-   * @param urlsFile The local urls file, one url per line.
-   * @param numThreads The number of threads used to resolve urls in parallel.
+   * @param urlsFile
+   *          The local urls file, one url per line.
+   * @param numThreads
+   *          The number of threads used to resolve urls in parallel.
    */
   public ResolveUrls(String urlsFile, int numThreads) {
     this.urlsFile = urlsFile;
@@ -163,17 +163,17 @@ public class ResolveUrls {
     OptionBuilder.withArgName("help");
     OptionBuilder.withDescription("show this help message");
     Option helpOpts = OptionBuilder.create("help");
-    
+
     OptionBuilder.withArgName("urls");
     OptionBuilder.hasArg();
     OptionBuilder.withDescription("the urls file to check");
     Option urlOpts = OptionBuilder.create("urls");
-    
+
     OptionBuilder.withArgName("numThreads");
     OptionBuilder.hasArgs();
     OptionBuilder.withDescription("the number of threads to use");
     Option numThreadOpts = OptionBuilder.create("numThreads");
-    
+
     options.addOption(helpOpts);
     options.addOption(urlOpts);
     options.addOption(numThreadOpts);
@@ -198,8 +198,7 @@ public class ResolveUrls {
       }
       ResolveUrls resolve = new ResolveUrls(urls, numThreads);
       resolve.resolveUrls();
-    }
-    catch (Exception e) {
+    } catch (Exception e) {
       LOG.error("ResolveUrls: " + StringUtils.stringifyException(e));
     }
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java Fri Jan  9 06:34:33 2015
@@ -30,21 +30,22 @@ import org.apache.hadoop.mapred.Reporter
 /**
  * A input format the reads arc files.
  */
-public class ArcInputFormat
-  extends FileInputFormat<Text, BytesWritable> {
+public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> {
 
   /**
    * Returns the <code>RecordReader</code> for reading the arc file.
    * 
-   * @param split The InputSplit of the arc file to process.
-   * @param job The job configuration.
-   * @param reporter The progress reporter.
+   * @param split
+   *          The InputSplit of the arc file to process.
+   * @param job
+   *          The job configuration.
+   * @param reporter
+   *          The progress reporter.
    */
   public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split,
-      JobConf job, Reporter reporter)
-    throws IOException {
+      JobConf job, Reporter reporter) throws IOException {
     reporter.setStatus(split.toString());
-    return new ArcRecordReader(job, (FileSplit)split);
+    return new ArcRecordReader(job, (FileSplit) split);
   }
 
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Fri Jan  9 06:34:33 2015
@@ -34,23 +34,29 @@ import org.apache.hadoop.util.Reflection
 import org.apache.hadoop.util.StringUtils;
 
 /**
- * <p>The <code>ArchRecordReader</code> class provides a record reader which 
- * reads records from arc files.</p>
+ * <p>
+ * The <code>ArcRecordReader</code> class provides a record reader which reads
+ * records from arc files.
+ * </p>
  * 
- * <p>Arc files are essentially tars of gzips.  Each record in an arc file is
- * a compressed gzip.  Multiple records are concatenated together to form a
- * complete arc.  For more information on the arc file format see
- * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p>
+ * <p>
+ * Arc files are essentially tars of gzips. Each record in an arc file is a
+ * compressed gzip. Multiple records are concatenated together to form a
+ * complete arc. For more information on the arc file format see
+ * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">the ARC file format</a>.
+ * </p>
  * 
- * <p>Arc files are used by the internet archive and grub projects.</p>
+ * <p>
+ * Arc files are used by the internet archive and grub projects.
+ * </p>
  * 
  * @see http://www.archive.org/
  * @see http://www.grub.org/
  */
-public class ArcRecordReader
-  implements RecordReader<Text, BytesWritable> {
+public class ArcRecordReader implements RecordReader<Text, BytesWritable> {
 
-  public static final Logger LOG = LoggerFactory.getLogger(ArcRecordReader.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ArcRecordReader.class);
 
   protected Configuration conf;
   protected long splitStart = 0;
@@ -60,30 +66,32 @@ public class ArcRecordReader
   protected long fileLen = 0;
   protected FSDataInputStream in;
 
-  private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B};
+  private static byte[] MAGIC = { (byte) 0x1F, (byte) 0x8B };
 
   /**
-   * <p>Returns true if the byte array passed matches the gzip header magic 
-   * number.</p>
+   * <p>
+   * Returns true if the byte array passed matches the gzip header magic number.
+   * </p>
    * 
-   * @param input The byte array to check.
+   * @param input
+   *          The byte array to check.
    * 
    * @return True if the byte array matches the gzip header magic number.
    */
   public static boolean isMagic(byte[] input) {
 
-	// check for null and incorrect length
+    // check for null and incorrect length
     if (input == null || input.length != MAGIC.length) {
       return false;
     }
-    
+
     // check byte by byte
     for (int i = 0; i < MAGIC.length; i++) {
       if (MAGIC[i] != input[i]) {
         return false;
       }
     }
-    
+
     // must match
     return true;
   }
@@ -91,13 +99,16 @@ public class ArcRecordReader
   /**
    * Constructor that sets the configuration and file split.
    * 
-   * @param conf The job configuration.
-   * @param split The file split to read from.
+   * @param conf
+   *          The job configuration.
+   * @param split
+   *          The file split to read from.
    * 
-   * @throws IOException  If an IO error occurs while initializing file split.
+   * @throws IOException
+   *           If an IO error occurs while initializing file split.
    */
   public ArcRecordReader(Configuration conf, FileSplit split)
-    throws IOException {
+      throws IOException {
 
     Path path = split.getPath();
     FileSystem fs = path.getFileSystem(conf);
@@ -113,8 +124,7 @@ public class ArcRecordReader
   /**
    * Closes the record reader resources.
    */
-  public void close()
-    throws IOException {
+  public void close() throws IOException {
     this.in.close();
   }
 
@@ -137,63 +147,64 @@ public class ArcRecordReader
    * 
    * @return The long of the current position in the file.
    */
-  public long getPos()
-    throws IOException {
+  public long getPos() throws IOException {
     return in.getPos();
   }
 
   /**
-   * Returns the percentage of progress in processing the file.  This will be
+   * Returns the percentage of progress in processing the file. This will be
    * represented as a float from 0 to 1 with 1 being 100% completed.
    * 
    * @return The percentage of progress as a float from 0 to 1.
    */
-  public float getProgress()
-    throws IOException {
-	  
+  public float getProgress() throws IOException {
+
     // if we haven't even started
     if (splitEnd == splitStart) {
       return 0.0f;
-    }
-    else {
-      // the progress is current pos - where we started  / length of the split
-      return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen);
+    } else {
+      // the progress is current pos - where we started / length of the split
+      return Math.min(1.0f, (getPos() - splitStart) / (float) splitLen);
     }
   }
 
   /**
-   * <p>Returns true if the next record in the split is read into the key and 
-   * value pair.  The key will be the arc record header and the values will be
-   * the raw content bytes of the arc record.</p>
+   * <p>
+   * Returns true if the next record in the split is read into the key and value
+   * pair. The key will be the arc record header and the values will be the raw
+   * content bytes of the arc record.
+   * </p>
    * 
-   * @param key The record key
-   * @param value The record value
+   * @param key
+   *          The record key
+   * @param value
+   *          The record value
    * 
    * @return True if the next record is read.
    * 
-   * @throws IOException If an error occurs while reading the record value.
+   * @throws IOException
+   *           If an error occurs while reading the record value.
    */
-  public boolean next(Text key, BytesWritable value)
-    throws IOException {
+  public boolean next(Text key, BytesWritable value) throws IOException {
 
     try {
-      
+
       // get the starting position on the input stream
       long startRead = in.getPos();
       byte[] magicBuffer = null;
-      
+
       // we need this loop to handle false positives in reading of gzip records
       while (true) {
-        
+
         // while we haven't passed the end of the split
         if (startRead >= splitEnd) {
           return false;
         }
-        
+
         // scanning for the gzip header
         boolean foundStart = false;
         while (!foundStart) {
-          
+
           // start at the current file position and scan for 1K at time, break
           // if there is no more to read
           startRead = in.getPos();
@@ -202,13 +213,13 @@ public class ArcRecordReader
           if (read < 0) {
             break;
           }
-          
-          // scan the byte array for the gzip header magic number.  This happens
+
+          // scan the byte array for the gzip header magic number. This happens
           // byte by byte
           for (int i = 0; i < read - 1; i++) {
             byte[] testMagic = new byte[2];
-            System.arraycopy(magicBuffer, i, testMagic, 0, 2);            
-            if (isMagic(testMagic)) {              
+            System.arraycopy(magicBuffer, i, testMagic, 0, 2);
+            if (isMagic(testMagic)) {
               // set the next start to the current gzip header
               startRead += i;
               foundStart = true;
@@ -216,14 +227,14 @@ public class ArcRecordReader
             }
           }
         }
-        
+
         // seek to the start of the gzip header
         in.seek(startRead);
         ByteArrayOutputStream baos = null;
         int totalRead = 0;
 
         try {
-          
+
           // read 4K of the gzip at a time putting into a byte array
           byte[] buffer = new byte[4096];
           GZIPInputStream zin = new GZIPInputStream(in);
@@ -233,9 +244,8 @@ public class ArcRecordReader
             baos.write(buffer, 0, gzipRead);
             totalRead += gzipRead;
           }
-        }
-        catch (Exception e) {
-          
+        } catch (Exception e) {
+
           // there are times we get false positives where the gzip header exists
           // but it is not an actual gzip record, so we ignore it and start
           // over seeking
@@ -248,7 +258,7 @@ public class ArcRecordReader
 
         // change the output stream to a byte array
         byte[] content = baos.toByteArray();
-        
+
         // the first line of the raw content in arc files is the header
         int eol = 0;
         for (int i = 0; i < content.length; i++) {
@@ -257,34 +267,33 @@ public class ArcRecordReader
             break;
           }
         }
-        
+
         // create the header and the raw content minus the header
         String header = new String(content, 0, eol).trim();
         byte[] raw = new byte[(content.length - eol) - 1];
         System.arraycopy(content, eol + 1, raw, 0, raw.length);
-        
+
         // populate key and values with the header and raw content.
         Text keyText = key;
         keyText.set(header);
         BytesWritable valueBytes = value;
         valueBytes.set(raw, 0, raw.length);
 
-        // TODO: It would be best to start at the end of the gzip read but 
-        // the bytes read in gzip don't match raw bytes in the file so we 
-        // overshoot the next header.  With this current method you get
+        // TODO: It would be best to start at the end of the gzip read but
+        // the bytes read in gzip don't match raw bytes in the file so we
+        // overshoot the next header. With this current method you get
         // some false positives but don't miss records.
         if (startRead + 1 < fileLen) {
           in.seek(startRead + 1);
         }
-        
+
         // populated the record, now return
         return true;
       }
+    } catch (Exception e) {
+      LOG.error(StringUtils.stringifyException(e));
     }
-    catch (Exception e) {
-      LOG.equals(StringUtils.stringifyException(e));      
-    }
-    
+
     // couldn't populate the record or there is no next record to read
     return false;
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/package-info.java Fri Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.
  */
 package org.apache.nutch.tools.arc;
+

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/package-info.java Fri Jan  9 06:34:33 2015
@@ -19,3 +19,4 @@
  * Miscellaneous tools.
  */
 package org.apache.nutch.tools;
+

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -46,16 +47,17 @@ public abstract class AbstractTestbedHan
 
   @Override
   public void handle(String target, HttpServletRequest req,
-          HttpServletResponse res, int dispatch) throws IOException,
-          ServletException {
-    Request base_request = (req instanceof Request) ? (Request)req : HttpConnection.getCurrentConnection().getRequest();
+      HttpServletResponse res, int dispatch) throws IOException,
+      ServletException {
+    Request base_request = (req instanceof Request) ? (Request) req
+        : HttpConnection.getCurrentConnection().getRequest();
     res.addHeader("X-TestbedHandlers", this.getClass().getSimpleName());
     handle(base_request, res, target, dispatch);
   }
-  
-  public abstract void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException;
-  
+
+  public abstract void handle(Request req, HttpServletResponse res,
+      String target, int dispatch) throws IOException, ServletException;
+
   public void addMyHeader(HttpServletResponse res, String name, String value) {
     name = "X-" + this.getClass().getSimpleName() + "-" + name;
     res.addHeader(name, value);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/DelayHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/DelayHandler.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/DelayHandler.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/DelayHandler.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -41,13 +42,13 @@ import javax.servlet.http.HttpServletRes
 import org.mortbay.jetty.Request;
 
 public class DelayHandler extends AbstractTestbedHandler {
-  
+
   public static final long DEFAULT_DELAY = 2000;
-  
+
   private int delay;
   private boolean random;
   private Random r;
-  
+
   public DelayHandler(int delay) {
     if (delay < 0) {
       delay = -delay;
@@ -59,13 +60,13 @@ public class DelayHandler extends Abstra
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
+      int dispatch) throws IOException, ServletException {
     try {
       int del = random ? r.nextInt(delay) : delay;
       Thread.sleep(del);
       addMyHeader(res, "Delay", String.valueOf(del));
     } catch (Exception e) {
-      
+
     }
   }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -44,9 +45,14 @@ import org.mortbay.jetty.HttpURI;
 import org.mortbay.jetty.Request;
 
 public class FakeHandler extends AbstractTestbedHandler {
-  /** Create links to hosts generated from a pool of numHosts/numPages random names. */
-  public static enum Mode {UNIQUE, RANDOM};
-    
+  /**
+   * Create links to hosts generated from a pool of numHosts/numPages random
+   * names.
+   */
+  public static enum Mode {
+    UNIQUE, RANDOM
+  };
+
   int numInternalLinks;
   int numExternalLinks;
   Mode hostMode;
@@ -55,34 +61,36 @@ public class FakeHandler extends Abstrac
   AtomicLong pageSeq = new AtomicLong(0);
   int numHosts;
   int numPages;
-  
+
   Random r = new Random(1234567890L); // predictable
   Random pageR;
 
-  private static final String testA = 
-    "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n" + 
-    "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>" +
-    "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>" +
-    "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
-  private static final String testB =
-    "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, similarly sized hail was falling all aro
 und us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>" +
-    "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>" +
-    "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
+  private static final String testA = "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n"
+      + "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>"
+      + "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>"
+      + "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
+  private static final String testB = "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, simila
 rly sized hail was falling all around us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>"
+      + "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>"
+      + "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
 
   /**
    * Create fake pages.
-   * @param hostMode if UNIQUE then each external outlink will use a unique host name. If
-   * RANDOM then each outlink will use a host name allocated from pool of numHosts.
-   * @param pageMode if UNIQUE then each internal outlinks will use a unique page name.
-   * if RANDOM then each outlink will use a page name allocated from pool of numPages.
+   * 
+   * @param hostMode
+   *          if UNIQUE then each external outlink will use a unique host name.
+   *          If RANDOM then each outlink will use a host name allocated from
+   *          pool of numHosts.
+   * @param pageMode
+   *          if UNIQUE then each internal outlinks will use a unique page name.
+   *          if RANDOM then each outlink will use a page name allocated from
+   *          pool of numPages.
    * @param numInternalLinks
    * @param numExternalLinks
    * @param numHosts
    * @param numPages
    */
-  public FakeHandler(Mode hostMode, Mode pageMode,
-      int numInternalLinks, int numExternalLinks,
-      int numHosts, int numPages) {
+  public FakeHandler(Mode hostMode, Mode pageMode, int numInternalLinks,
+      int numExternalLinks, int numHosts, int numPages) {
     this.numExternalLinks = numExternalLinks;
     this.numInternalLinks = numInternalLinks;
     this.numHosts = numHosts;
@@ -90,10 +98,10 @@ public class FakeHandler extends Abstrac
     this.hostMode = hostMode;
     this.pageMode = pageMode;
   }
-  
+
   @Override
-  public void handle(Request req, HttpServletResponse res, String target, 
-          int dispatch) throws IOException, ServletException {
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
     HttpURI u = req.getUri();
     String uri = u.toString();
     addMyHeader(res, "URI", uri);
@@ -126,7 +134,7 @@ public class FakeHandler extends Abstrac
       for (int i = 0; i < numInternalLinks; i++) {
         String link = "<p><a href='";
         if (pageMode.equals(Mode.RANDOM)) {
-          link += pageR.nextInt (numPages) + ".html'>";
+          link += pageR.nextInt(numPages) + ".html'>";
         } else {
           if (!basePath.endsWith("/")) {
             link += "/";
@@ -157,13 +165,14 @@ public class FakeHandler extends Abstrac
       }
       // fake a link to the root URL
       link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
-      if (u.getPort() != 80 && u.getPort() != -1) link += ":" + u.getPort();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        link += ":" + u.getPort();
       link += "/'>site " + u.getHost() + "</a></p>\r\n";
       os.write(link.getBytes());
       os.write(testB.getBytes());
       res.flushBuffer();
     } catch (IOException ioe) {
-    }    
+    }
   }
 
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -47,29 +48,33 @@ import org.slf4j.LoggerFactory;
 import org.mortbay.jetty.Request;
 
 public class LogDebugHandler extends AbstractTestbedHandler implements Filter {
-  private static final Logger LOG = LoggerFactory.getLogger(LogDebugHandler.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(LogDebugHandler.class);
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
-    LOG.info("-- " + req.getMethod() + " " + req.getUri().toString() + "\n" + req.getConnection().getRequestFields());
+      int dispatch) throws IOException, ServletException {
+    LOG.info("-- " + req.getMethod() + " " + req.getUri().toString() + "\n"
+        + req.getConnection().getRequestFields());
   }
 
   @Override
   public void doFilter(ServletRequest req, ServletResponse res,
-          FilterChain chain) throws IOException, ServletException {
-    ((HttpServletResponse)res).addHeader("X-Handled-By", "AsyncProxyHandler");
-    ((HttpServletResponse)res).addHeader("X-TestbedHandlers", "AsyncProxyHandler");
+      FilterChain chain) throws IOException, ServletException {
+    ((HttpServletResponse) res).addHeader("X-Handled-By", "AsyncProxyHandler");
+    ((HttpServletResponse) res).addHeader("X-TestbedHandlers",
+        "AsyncProxyHandler");
     try {
       chain.doFilter(req, res);
     } catch (Throwable e) {
-      ((HttpServletResponse)res).sendError(HttpServletResponse.SC_BAD_REQUEST, e.toString());
+      ((HttpServletResponse) res).sendError(HttpServletResponse.SC_BAD_REQUEST,
+          e.toString());
     }
   }
 
   @Override
   public void init(FilterConfig arg0) throws ServletException {
     // TODO Auto-generated method stub
-    
+
   }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -43,13 +44,13 @@ public class NotFoundHandler extends Abs
 
   @Override
   public void handle(Request req, HttpServletResponse res, String target,
-          int dispatch) throws IOException, ServletException {
+      int dispatch) throws IOException, ServletException {
     // don't pass it down the chain
     req.setHandled(true);
     res.addHeader("X-Handled-By", getClass().getSimpleName());
     addMyHeader(res, "URI", req.getUri().toString());
-    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: " +
-            req.getUri().toString());
+    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: "
+        + req.getUri().toString());
   }
 
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java Fri Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  ******************************************************************************/
 package org.apache.nutch.tools.proxy;
+
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -52,19 +53,32 @@ public class TestbedProxy {
    */
   public static void main(String[] args) throws Exception {
     if (args.length == 0) {
-      System.err.println("TestbedProxy [-port <nnn>] [-forward] [-fake [...]] [-delay nnn] [-debug]");
-      System.err.println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
-      System.err.println("-forward\tif specified, requests to all unknown urls will be passed to");
-      System.err.println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
-      System.err.println("-delay\tdelay every response by nnn seconds. If delay is negative use a random value up to nnn");
-      System.err.println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
-      System.err.println("\nAdditional options for -fake handler (all optional):");
-      System.err.println("\t-hostMode (u | r)\tcreate unique host names, or pick random from a pool");
-      System.err.println("\t-pageMode (u | r)\tcreate unique page names, or pick random from a pool");
-      System.err.println("\t-numHosts N\ttotal number of hosts when using hostMode r");
-      System.err.println("\t-numPages N\ttotal number of pages per host when using pageMode r");
-      System.err.println("\t-intLinks N\tnumber of internal (same host) links per page");
-      System.err.println("\t-extLinks N\tnumber of external (other host) links per page");
+      System.err
+          .println("TestbedProxy [-port <nnn>] [-forward] [-fake [...]] [-delay nnn] [-debug]");
+      System.err
+          .println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
+      System.err
+          .println("-forward\tif specified, requests to all unknown urls will be passed to");
+      System.err
+          .println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
+      System.err
+          .println("-delay\tdelay every response by nnn seconds. If delay is negative use a random value up to nnn");
+      System.err
+          .println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
+      System.err
+          .println("\nAdditional options for -fake handler (all optional):");
+      System.err
+          .println("\t-hostMode (u | r)\tcreate unique host names, or pick random from a pool");
+      System.err
+          .println("\t-pageMode (u | r)\tcreate unique page names, or pick random from a pool");
+      System.err
+          .println("\t-numHosts N\ttotal number of hosts when using hostMode r");
+      System.err
+          .println("\t-numPages N\ttotal number of pages per host when using pageMode r");
+      System.err
+          .println("\t-intLinks N\tnumber of internal (same host) links per page");
+      System.err
+          .println("\t-extLinks N\tnumber of external (other host) links per page");
       System.err.println("\nDefaults for -fake handler:");
       System.err.println("\t-hostMode r");
       System.err.println("\t-pageMode r");
@@ -74,7 +88,7 @@ public class TestbedProxy {
       System.err.println("\t-extLinks 5");
       System.exit(-1);
     }
-    
+
     Configuration conf = NutchConfiguration.create();
     int port = conf.getInt("batch.proxy.port", 8181);
     boolean forward = false;
@@ -88,7 +102,7 @@ public class TestbedProxy {
     int numPages = 10000;
     int intLinks = 10;
     int extLinks = 5;
-    
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-port")) {
         port = Integer.parseInt(args[++i]);
@@ -122,28 +136,30 @@ public class TestbedProxy {
         System.exit(-1);
       }
     }
-    
+
     // Create the server
     Server server = new Server();
     SocketConnector connector = new SocketConnector();
     connector.setPort(port);
     connector.setResolveNames(false);
     server.addConnector(connector);
-    
+
     // create a list of handlers
     HandlerList list = new HandlerList();
     server.addHandler(list);
-    
+
     if (debug) {
       LOG.info("* Added debug handler.");
       list.addHandler(new LogDebugHandler());
     }
- 
+
     if (delay) {
-      LOG.info("* Added delay handler: " + (delayVal < 0 ? "random delay up to " + (-delayVal) : "constant delay of " + delayVal));
+      LOG.info("* Added delay handler: "
+          + (delayVal < 0 ? "random delay up to " + (-delayVal)
+              : "constant delay of " + delayVal));
       list.addHandler(new DelayHandler(delayVal));
     }
-    
+
     // XXX alternatively, we can add the DispatchHandler as the first one,
     // XXX to activate handler plugins and redirect requests to appropriate
     // XXX handlers ... Here we always load these handlers

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/package-info.java Fri Jan  9 06:34:33 2015
@@ -19,3 +19,4 @@
  * Proxy to {@link org.apache.nutch.tools.Benchmark benchmark} the crawler.
  */
 package org.apache.nutch.tools.proxy;
+