You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/30 23:38:37 UTC

svn commit: r980974 - in /nutch/trunk: build.xml src/test/org/apache/nutch/tools/Benchmark.java src/test/org/apache/nutch/tools/proxy/FakeHandler.java

Author: ab
Date: Fri Jul 30 21:38:36 2010
New Revision: 980974

URL: http://svn.apache.org/viewvc?rev=980974&view=rev
Log:
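Fix a silly last-minute error (FakeHandler computed the substring end from uri.length() instead of the path length) and improve the benchmark: add a -maxPerHost option, invert links after every crawl cycle, print CrawlDb statistics at the end, and have the fake proxy emit five links to random nonexistent hosts per page instead of one.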

Modified:
    nutch/trunk/build.xml
    nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java
    nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jul 30 21:38:36 2010
@@ -234,8 +234,12 @@
     <java classname="org.apache.nutch.tools.Benchmark" fork="true">
       <classpath refid="test.classpath"/>
       <jvmarg line="-Xmx512m -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
-      <arg value="-seeds"/>
+      <arg value="-maxPerHost"/>
       <arg value="10"/>
+      <arg value="-seeds"/>
+      <arg value="1"/>
+      <arg value="-depth"/>
+      <arg value="5"/>
     </java>
   </target>
 

Modified: nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java Fri Jul 30 21:38:36 2010
@@ -17,6 +17,7 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.Crawl;
 import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.CrawlDbReader;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.Injector;
 import org.apache.nutch.crawl.LinkDb;
@@ -61,7 +62,7 @@ public class Benchmark extends Configure
     long topN = Long.MAX_VALUE;
     
     if (args.length == 0) {
-      System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-plugins <regex>]");
+      System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
       System.err.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
       System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
       System.err.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
@@ -69,8 +70,10 @@ public class Benchmark extends Configure
       System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
       System.err.println("\tNOTE: if not specified, this is reset to: " + plugins);
       System.err.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
+      System.err.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
       return -1;
     }
+    int maxPerHost = Integer.MAX_VALUE;
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-seeds")) {
         seeds = Integer.parseInt(args[++i]);
@@ -82,6 +85,8 @@ public class Benchmark extends Configure
         delete = false;
       } else if (args[i].equals("-plugins")) {
         plugins = args[++i];
+      } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
+        maxPerHost = Integer.parseInt(args[++i]);
       } else {
         LOG.fatal("Invalid argument: '" + args[i] + "'");
         return -1;
@@ -91,9 +96,12 @@ public class Benchmark extends Configure
     conf.set("http.proxy.host", "localhost");
     conf.setInt("http.proxy.port", 8181);
     conf.set("http.agent.name", "test");
+    conf.set("http.robots.agents", "test,*");
     if (!plugins.equals("default")) {
       conf.set("plugin.includes", plugins);
     }
+    conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
+    conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
     JobConf job = new NutchJob(getConf());    
     FileSystem fs = FileSystem.get(job);
     Path dir = new Path(getConf().get("hadoop.tmp.dir"),
@@ -136,6 +144,7 @@ public class Benchmark extends Configure
         parseSegment.parse(segs[0]);    // parse it, if needed
       }
       crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
+      linkDbTool.invert(linkDb, segs, true, true, false); // invert links
       // delete data
       if (delete) {
         for (Path p : segs) {
@@ -143,14 +152,14 @@ public class Benchmark extends Configure
         }
       }
     }
-    if (i > 0) {
-      linkDbTool.invert(linkDb, segments, true, true, false); // invert links
-    } else {
+    if (i == 0) {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }
     if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
     long end = System.currentTimeMillis();
     LOG.info("TOTAL TIME: " + (end - start)/1000 + " sec");
+    CrawlDbReader dbreader = new CrawlDbReader();
+    dbreader.processStatJob(crawlDb.toString(), conf, false);
     return 0;
   }
 

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java Fri Jul 30 21:38:36 2010
@@ -63,7 +63,7 @@ public class FakeHandler extends Abstrac
       // fake some links
       String base;
       if (u.getPath().length() > 5) {
-        base = u.getPath().substring(0, uri.length() - 5);
+        base = u.getPath().substring(0, u.getPath().length() - 5);
       } else {
         base = u.getPath();
       }
@@ -79,12 +79,14 @@ public class FakeHandler extends Abstrac
         link += i + ".html'>outlink " + i + "</a></p>\r\n";
         os.write(link.getBytes());
       }
-      // fake a link to a random nonexistent host
-      int h = r.nextInt(1000000); // 1 mln hosts
-      String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host " + h + "</a></p>\r\n";
-      os.write(link.getBytes());
+      // fake a few links to random nonexistent hosts
+      for (int i = 0; i < 5; i++) {
+        int h = r.nextInt(1000000); // 1 mln hosts
+        String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host " + h + "</a></p>\r\n";
+        os.write(link.getBytes());
+      }
       // fake a link to the root URL
-      link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
+      String link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
       if (u.getPort() != 80 && u.getPort() != -1) link += ":" + u.getPort();
       link += "/'>site " + u.getHost() + "</a></p>\r\n";
       os.write(link.getBytes());