You are viewing a plain text version of this content; the canonical (HTML) version, including its hyperlink, is available in the original mailing-list archive.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/19 19:48:40 UTC
svn commit: r476879 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/test/ src/test/org/apache/nutch/crawl/
src/test/org/apache/nutch/fetcher/
Author: siren
Date: Sun Nov 19 10:48:39 2006
New Revision: 476879
URL: http://svn.apache.org/viewvc?view=rev&rev=476879
Log:
NUTCH-403 Make URL filtering optional in Generator
Added:
lucene/nutch/trunk/src/test/filter-all.txt
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 10:48:39 2006
@@ -78,6 +78,8 @@
25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren)
+26. NUTCH-403 - Make URL filtering optional in Generator (siren)
+
Release 0.8 - 2006-07-25
0. Totally new architecture, based on hadoop
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sun Nov 19 10:48:39 2006
@@ -115,9 +115,8 @@
injector.inject(crawlDb, rootUrlDir);
for (int i = 0; i < depth; i++) { // generate new segment
- Path segment =
- generator.generate(crawlDb, segments, -1,
- topN, System.currentTimeMillis());
+ Path segment = generator.generate(crawlDb, segments, -1, topN, System
+ .currentTimeMillis(), false);
fetcher.fetch(segment, threads); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segment); // parse it, if needed
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Nov 19 10:48:39 2006
@@ -44,6 +44,7 @@
/** Generates a subset of a crawl db to fetch. */
public class Generator extends ToolBase {
+ public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
public static final String CRAWL_TOP_N = "crawl.topN";
@@ -89,6 +90,7 @@
private FloatWritable sortValue = new FloatWritable();
private boolean byIP;
private long dnsFailure = 0L;
+ private boolean filter;
public void configure(JobConf job) {
curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
@@ -99,6 +101,7 @@
normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
scfilters = new ScoringFilters(job);
hostPartitioner.configure(job);
+ filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
}
public void close() {}
@@ -108,13 +111,16 @@
OutputCollector output, Reporter reporter)
throws IOException {
Text url = (Text)key;
- // don't generate URLs that don't pass URLFilters
- try {
- if (filters.filter(url.toString()) == null)
- return;
- } catch (URLFilterException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+ if (filter) {
+ // If filtering is on don't generate URLs that don't pass URLFilters
+ try {
+ if (filters.filter(url.toString()) == null)
+ return;
+ } catch (URLFilterException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
+ + ")");
+ }
}
}
CrawlDatum crawlDatum = (CrawlDatum)value;
@@ -291,13 +297,13 @@
/** Generate fetchlists in a segment. */
public Path generate(Path dbDir, Path segments)
throws IOException {
- return generate(dbDir, segments,
- -1, Long.MAX_VALUE, System.currentTimeMillis());
+ return generate(dbDir, segments, -1, Long.MAX_VALUE, System
+ .currentTimeMillis(), true);
}
/** Generate fetchlists in a segment. */
public Path generate(Path dbDir, Path segments,
- int numLists, long topN, long curTime)
+ int numLists, long topN, long curTime, boolean filter)
throws IOException {
Path tempDir =
@@ -308,10 +314,12 @@
Path segment = new Path(segments, generateSegmentName());
Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
- if (LOG.isInfoEnabled()) {
- LOG.info("Generator: starting");
- LOG.info("Generator: segment: " + segment);
- LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+ LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+ LOG.info("Generator: starting");
+ LOG.info("Generator: segment: " + segment);
+ LOG.info("Generator: filtering: " + filter);
+ if (topN != Long.MAX_VALUE) {
+ LOG.info("Generator: topN: " + topN);
}
// map to inverted subset due for fetch, sort by link count
@@ -326,8 +334,9 @@
LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
numLists = 1;
}
- job.setLong("crawl.gen.curTime", curTime);
- job.setLong("crawl.topN", topN);
+ job.setLong(CRAWL_GEN_CUR_TIME, curTime);
+ job.setLong(CRAWL_TOP_N, topN);
+ job.setBoolean(CRAWL_GENERATE_FILTER, filter);
job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
@@ -393,7 +402,7 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays]");
+ System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]");
return -1;
}
@@ -402,6 +411,7 @@
long curTime = System.currentTimeMillis();
long topN = Long.MAX_VALUE;
int numFetchers = -1;
+ boolean filter = true;
for (int i = 2; i < args.length; i++) {
if ("-topN".equals(args[i])) {
@@ -413,14 +423,14 @@
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[i+1]);
curTime += numDays * 1000L * 60 * 60 * 24;
+ } else if ("-noFilter".equals(args[i])) {
+ filter = false;
}
+
}
- if ((LOG.isInfoEnabled()) && (topN != Long.MAX_VALUE)) {
- LOG.info("topN: " + topN);
- }
try {
- generate(dbDir, segmentsDir, numFetchers, topN, curTime);
+ generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter);
return 0;
} catch (Exception e) {
LOG.fatal("Generator: " + StringUtils.stringifyException(e));
Added: lucene/nutch/trunk/src/test/filter-all.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/filter-all.txt?view=auto&rev=476879
==============================================================================
--- lucene/nutch/trunk/src/test/filter-all.txt (added)
+++ lucene/nutch/trunk/src/test/filter-all.txt Sun Nov 19 10:48:39 2006
@@ -0,0 +1,7 @@
+# Config file for urlfilter-suffix plugin
+# Filter away all urls
+
+# case-insensitive, disallow unknown suffixes
+-I
+
+# allow these
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sun Nov 19 10:48:39 2006
@@ -85,7 +85,7 @@
createCrawlDB(list);
- Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+ Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
Path fetchlist = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -145,7 +145,8 @@
Configuration myConfiguration = new Configuration(conf);
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
- Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, false);
Path fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -155,10 +156,10 @@
// verify we got right amount of records
assertEquals(1, fetchList.size());
-
myConfiguration = new Configuration(conf);
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -170,7 +171,8 @@
myConfiguration = new Configuration(conf);
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -180,7 +182,7 @@
// verify we got right amount of records
assertEquals(3, fetchList.size());
}
-
+
/**
* Test that generator obeys the property "generate.max.per.host" and
* "generate.max.per.host.by.ip".
@@ -189,12 +191,9 @@
public void testGenerateHostIPLimit() throws Exception{
ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
- list.add(createURLCrawlDatum("http://www.example.com/index.html",
- 1, 1));
- list.add(createURLCrawlDatum("http://www.example.net/index.html",
- 1, 1));
- list.add(createURLCrawlDatum("http://www.example.org/index.html",
- 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
createCrawlDB(list);
@@ -202,7 +201,8 @@
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
- Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, false);
Path fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -214,7 +214,7 @@
myConfiguration = new Configuration(myConfiguration);
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -226,7 +226,8 @@
myConfiguration = new Configuration(myConfiguration);
myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
@@ -237,6 +238,47 @@
assertEquals(3, fetchList.size());
}
+ /**
+ * Test generator obeys the filter setting.
+ * @throws Exception
+ * @throws IOException
+ */
+ public void testFilter() throws IOException, Exception{
+
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
+
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, true);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify all got filtered out
+ assertEquals(0, fetchList.size());
+
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify nothing got filtered
+ assertEquals(list.size(), fetchList.size());
+
+ }
+
/**
* Read contents of fetchlist.
@@ -270,11 +312,12 @@
* @return path to generated segment
* @throws IOException
*/
- private Path generateFetchlist(int numResults, Configuration config) throws IOException {
+ private Path generateFetchlist(int numResults, Configuration config,
+ boolean filter) throws IOException {
// generate segment
Generator g = new Generator(config);
Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
- Long.MAX_VALUE);
+ Long.MAX_VALUE, filter);
return generatedSegment;
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=476879&r1=476878&r2=476879
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Nov 19 10:48:39 2006
@@ -87,7 +87,8 @@
//generate
Generator g=new Generator(conf);
- Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE);
+ Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
+ Long.MAX_VALUE, Long.MAX_VALUE, false);
long time=System.currentTimeMillis();
//fetch