You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/30 23:38:37 UTC
svn commit: r980974 - in /nutch/trunk: build.xml
src/test/org/apache/nutch/tools/Benchmark.java
src/test/org/apache/nutch/tools/proxy/FakeHandler.java
Author: ab
Date: Fri Jul 30 21:38:36 2010
New Revision: 980974
URL: http://svn.apache.org/viewvc?rev=980974&view=rev
Log:
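Fix a silly last-minute error (FakeHandler computed the substring end index from uri.length() instead of u.getPath().length()) and improve the benchmark: add a -maxPerHost option, invert links as part of each crawl cycle, print CrawlDb statistics when the crawl finishes, and emit five fake-host links per generated page instead of one.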
Modified:
nutch/trunk/build.xml
nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jul 30 21:38:36 2010
@@ -234,8 +234,12 @@
<java classname="org.apache.nutch.tools.Benchmark" fork="true">
<classpath refid="test.classpath"/>
<jvmarg line="-Xmx512m -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
- <arg value="-seeds"/>
+ <arg value="-maxPerHost"/>
<arg value="10"/>
+ <arg value="-seeds"/>
+ <arg value="1"/>
+ <arg value="-depth"/>
+ <arg value="5"/>
</java>
</target>
Modified: nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/Benchmark.java Fri Jul 30 21:38:36 2010
@@ -17,6 +17,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Crawl;
import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.crawl.LinkDb;
@@ -61,7 +62,7 @@ public class Benchmark extends Configure
long topN = Long.MAX_VALUE;
if (args.length == 0) {
- System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-plugins <regex>]");
+ System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
System.err.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
System.err.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
@@ -69,8 +70,10 @@ public class Benchmark extends Configure
System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
System.err.println("\tNOTE: if not specified, this is reset to: " + plugins);
System.err.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
+ System.err.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
return -1;
}
+ int maxPerHost = Integer.MAX_VALUE;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-seeds")) {
seeds = Integer.parseInt(args[++i]);
@@ -82,6 +85,8 @@ public class Benchmark extends Configure
delete = false;
} else if (args[i].equals("-plugins")) {
plugins = args[++i];
+ } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
+ maxPerHost = Integer.parseInt(args[++i]);
} else {
LOG.fatal("Invalid argument: '" + args[i] + "'");
return -1;
@@ -91,9 +96,12 @@ public class Benchmark extends Configure
conf.set("http.proxy.host", "localhost");
conf.setInt("http.proxy.port", 8181);
conf.set("http.agent.name", "test");
+ conf.set("http.robots.agents", "test,*");
if (!plugins.equals("default")) {
conf.set("plugin.includes", plugins);
}
+ conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
+ conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
JobConf job = new NutchJob(getConf());
FileSystem fs = FileSystem.get(job);
Path dir = new Path(getConf().get("hadoop.tmp.dir"),
@@ -136,6 +144,7 @@ public class Benchmark extends Configure
parseSegment.parse(segs[0]); // parse it, if needed
}
crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
+ linkDbTool.invert(linkDb, segs, true, true, false); // invert links
// delete data
if (delete) {
for (Path p : segs) {
@@ -143,14 +152,14 @@ public class Benchmark extends Configure
}
}
}
- if (i > 0) {
- linkDbTool.invert(linkDb, segments, true, true, false); // invert links
- } else {
+ if (i == 0) {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
long end = System.currentTimeMillis();
LOG.info("TOTAL TIME: " + (end - start)/1000 + " sec");
+ CrawlDbReader dbreader = new CrawlDbReader();
+ dbreader.processStatJob(crawlDb.toString(), conf, false);
return 0;
}
Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java?rev=980974&r1=980973&r2=980974&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java Fri Jul 30 21:38:36 2010
@@ -63,7 +63,7 @@ public class FakeHandler extends Abstrac
// fake some links
String base;
if (u.getPath().length() > 5) {
- base = u.getPath().substring(0, uri.length() - 5);
+ base = u.getPath().substring(0, u.getPath().length() - 5);
} else {
base = u.getPath();
}
@@ -79,12 +79,14 @@ public class FakeHandler extends Abstrac
link += i + ".html'>outlink " + i + "</a></p>\r\n";
os.write(link.getBytes());
}
- // fake a link to a random nonexistent host
- int h = r.nextInt(1000000); // 1 mln hosts
- String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host " + h + "</a></p>\r\n";
- os.write(link.getBytes());
+ // fake a few links to random nonexistent hosts
+ for (int i = 0; i < 5; i++) {
+ int h = r.nextInt(1000000); // 1 mln hosts
+ String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host " + h + "</a></p>\r\n";
+ os.write(link.getBytes());
+ }
// fake a link to the root URL
- link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
+ String link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
if (u.getPort() != 80 && u.getPort() != -1) link += ":" + u.getPort();
link += "/'>site " + u.getHost() + "</a></p>\r\n";
os.write(link.getBytes());