You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/12/28 01:03:05 UTC
svn commit: r490607 - in /lucene/nutch/trunk: ./ conf/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/segment/ src/java/org/apache/nut...
Author: ab
Date: Wed Dec 27 16:03:04 2006
New Revision: 490607
URL: http://svn.apache.org/viewvc?view=rev&rev=490607
Log:
This patch addresses several issues:
* NUTCH-415 - Generator should mark selected records in CrawlDb.
Due to increased resource consumption this step is optional.
Application-level locking has been added to prevent concurrent
modification of databases.
* NUTCH-416 - CrawlDatum status and CrawlDbReducer refactoring. It is
now possible to correctly update CrawlDb from multiple segments.
Introduce new status codes for temporary and permanent
redirection.
* NUTCH-322 - Fix Fetcher to store redirected pages and to store
protocol-level status. This also should fix NUTCH-273.
* Change default Fetcher behavior not to follow redirects immediately.
Instead, the Fetcher will record redirects as new pages to be added to the CrawlDb.
This also partially addresses NUTCH-273.
* Detect and report when Generator creates 0-sized segments.
* Fix Injector to preserve an already existing CrawlDatum if the seed list
being injected also contains such a URL.
This development was partially supported by SiteSell Inc.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (with props)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Dec 27 16:03:04 2006
@@ -92,6 +92,29 @@
30. NUTCH-406 - Metadata tries to write null values (mattmann)
+31. NUTCH-415 - Generator should mark selected records in CrawlDb.
+ Due to increased resource consumption this step is optional.
+ Application-level locking has been added to prevent concurrent
+ modification of databases. (ab)
+
+32. NUTCH-416 - CrawlDatum status and CrawlDbReducer refactoring. It is
+ now possible to correctly update CrawlDb from multiple segments.
+ Introduce new status codes for temporary and permanent
+ redirection. (ab)
+
+33. NUTCH-322 - Fix Fetcher to store redirected pages and to store
+ protocol-level status. This also should fix NUTCH-273. (ab)
+
+34. Change default Fetcher behavior not to follow redirects immediately.
+ Instead Fetcher will record redirects as new pages to be added to CrawlDb.
+ This also partially addresses NUTCH-273. (ab)
+
+35. Detect and report when Generator creates 0-sized segments. (ab)
+
+36. Fix Injector to preserve already existing CrawlDatum if the seed list
+ being injected also contains such URL. (ab)
+
+
Release 0.8 - 2006-07-25
0. Totally new architecture, based on hadoop
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Dec 27 16:03:04 2006
@@ -147,9 +147,11 @@
<property>
<name>http.redirect.max</name>
- <value>3</value>
+ <value>0</value>
<description>The maximum number of redirects the fetcher will follow when
- trying to fetch a page.</description>
+ trying to fetch a page. If set to negative or 0, fetcher won't immediately
+ follow redirected URLs, instead it will record them for later fetching.
+ </description>
</property>
<property>
@@ -377,6 +379,17 @@
remote DNS servers, not to mention increased external traffic
and latency. For these reasons when using this option it is
required that a local caching DNS be used.</description>
+</property>
+
+<property>
+ <name>generate.update.crawldb</name>
+ <value>false</value>
+ <description>For highly-concurrent environments, where several
+ generate/fetch/update cycles may overlap, setting this to true ensures
+ that generate will create different fetchlists even without intervening
+ updatedb-s, at the cost of running an additional job to update CrawlDB.
+ If false, running generate twice without intervening
+ updatedb will generate identical fetchlists.</description>
</property>
<!-- fetcher properties -->
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Wed Dec 27 16:03:04 2006
@@ -116,15 +116,15 @@
for (int i = 0; i < depth; i++) { // generate new segment
Path segment = generator.generate(crawlDb, segments, -1, topN, System
- .currentTimeMillis(), false);
+ .currentTimeMillis(), false, false);
fetcher.fetch(segment, threads); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segment); // parse it, if needed
}
- crawlDbTool.update(crawlDb, segment, true, true); // update crawldb
+ crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
}
- linkDbTool.invert(linkDb, segments, true, true); // invert links
+ linkDbTool.invert(linkDb, segments, true, true, false); // invert links
// index, dedup & merge
indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Dec 27 16:03:04 2006
@@ -25,34 +25,86 @@
/* The crawl state of a url. */
public class CrawlDatum implements WritableComparable, Cloneable {
- public static final String DB_DIR_NAME = "current";
-
public static final String GENERATE_DIR_NAME = "crawl_generate";
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 4;
-
- public static final byte STATUS_SIGNATURE = 0;
- public static final byte STATUS_DB_UNFETCHED = 1;
- public static final byte STATUS_DB_FETCHED = 2;
- public static final byte STATUS_DB_GONE = 3;
- public static final byte STATUS_LINKED = 4;
- public static final byte STATUS_FETCH_SUCCESS = 5;
- public static final byte STATUS_FETCH_RETRY = 6;
- public static final byte STATUS_FETCH_GONE = 7;
-
- public static final String[] statNames = {
- "signature",
- "DB_unfetched",
- "DB_fetched",
- "DB_gone",
- "linked",
- "fetch_success",
- "fetch_retry",
- "fetch_gone"
- };
+ private final static byte CUR_VERSION = 5;
+ /** Compatibility values for on-the-fly conversion from versions < 5. */
+ private static final byte OLD_STATUS_SIGNATURE = 0;
+ private static final byte OLD_STATUS_DB_UNFETCHED = 1;
+ private static final byte OLD_STATUS_DB_FETCHED = 2;
+ private static final byte OLD_STATUS_DB_GONE = 3;
+ private static final byte OLD_STATUS_LINKED = 4;
+ private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
+ private static final byte OLD_STATUS_FETCH_RETRY = 6;
+ private static final byte OLD_STATUS_FETCH_GONE = 7;
+
+ private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
+
+ /** Page was not fetched yet. */
+ public static final byte STATUS_DB_UNFETCHED = 0x01;
+ /** Page was successfully fetched. */
+ public static final byte STATUS_DB_FETCHED = 0x02;
+ /** Page no longer exists. */
+ public static final byte STATUS_DB_GONE = 0x03;
+ /** Page temporarily redirects to other page. */
+ public static final byte STATUS_DB_REDIR_TEMP = 0x04;
+ /** Page permanently redirects to other page. */
+ public static final byte STATUS_DB_REDIR_PERM = 0x05;
+
+ /** Maximum value of DB-related status. */
+ public static final byte STATUS_DB_MAX = 0x1f;
+
+ /** Fetching was successful. */
+ public static final byte STATUS_FETCH_SUCCESS = 0x21;
+ /** Fetching unsuccessful, needs to be retried (transient errors). */
+ public static final byte STATUS_FETCH_RETRY = 0x22;
+ /** Fetching temporarily redirected to other page. */
+ public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
+ /** Fetching permanently redirected to other page. */
+ public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
+ /** Fetching unsuccessful - page is gone. */
+ public static final byte STATUS_FETCH_GONE = 0x25;
+
+ /** Maximum value of fetch-related status. */
+ public static final byte STATUS_FETCH_MAX = 0x3f;
+
+ /** Page signature. */
+ public static final byte STATUS_SIGNATURE = 0x41;
+ /** Page was newly injected. */
+ public static final byte STATUS_INJECTED = 0x42;
+ /** Page discovered through a link. */
+ public static final byte STATUS_LINKED = 0x43;
+
+
+ public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
+ static {
+ statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
+ statNames.put(STATUS_DB_FETCHED, "db_fetched");
+ statNames.put(STATUS_DB_GONE, "db_gone");
+ statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
+ statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
+ statNames.put(STATUS_SIGNATURE, "signature");
+ statNames.put(STATUS_INJECTED, "injected");
+ statNames.put(STATUS_LINKED, "linked");
+ statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
+ statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
+ statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
+ statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
+ statNames.put(STATUS_FETCH_GONE, "fetch_gone");
+
+ oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
+ oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
+ oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
+ oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
+ oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
+ oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
+ oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
+ oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
+ }
+
private static final float MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;
private byte status;
@@ -63,6 +115,16 @@
private byte[] signature = null;
private long modifiedTime;
private MapWritable metaData;
+
+ public static boolean hasDbStatus(CrawlDatum datum) {
+ if (datum.status <= STATUS_DB_MAX) return true;
+ return false;
+ }
+
+ public static boolean hasFetchStatus(CrawlDatum datum) {
+ if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) return true;
+ return false;
+ }
public CrawlDatum() {}
@@ -81,6 +143,13 @@
//
public byte getStatus() { return status; }
+
+ public static String getStatusName(byte value) {
+ String res = statNames.get(value);
+ if (res == null) res = "unknown";
+ return res;
+ }
+
public void setStatus(int status) { this.status = (byte)status; }
public long getFetchTime() { return fetchTime; }
@@ -174,6 +243,14 @@
}
}
}
+ // translate status codes
+ if (version < 5) {
+ if (oldToNew.containsKey(status))
+ status = oldToNew.get(status);
+ else
+ status = STATUS_DB_UNFETCHED;
+
+ }
}
/** The number of bytes into a CrawlDatum that the score is stored. */
@@ -285,7 +362,7 @@
public String toString() {
StringBuffer buf = new StringBuffer();
buf.append("Version: " + CUR_VERSION + "\n");
- buf.append("Status: " + getStatus() + " (" + statNames[getStatus()] + ")\n");
+ buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Dec 27 16:03:04 2006
@@ -31,6 +31,7 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
+import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -39,9 +40,14 @@
* crawldb accordingly.
*/
public class CrawlDb extends ToolBase {
+ public static final Log LOG = LogFactory.getLog(CrawlDb.class);
+
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
- public static final Log LOG = LogFactory.getLog(CrawlDb.class);
+ public static final String CURRENT_NAME = "current";
+
+ public static final String LOCK_NAME = ".locked";
+
public CrawlDb() {
@@ -51,17 +57,19 @@
setConf(conf);
}
- public void update(Path crawlDb, Path segment, boolean normalize, boolean filter) throws IOException {
+ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- update(crawlDb, segment, normalize, filter, additionsAllowed);
+ update(crawlDb, segments, normalize, filter, additionsAllowed, false);
}
- public void update(Path crawlDb, Path segment, boolean normalize, boolean filter, boolean additionsAllowed) throws IOException {
-
+ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException {
+ FileSystem fs = FileSystem.get(getConf());
+ Path lock = new Path(crawlDb, LOCK_NAME);
+ LockUtil.createLockFile(fs, lock, force);
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: starting");
LOG.info("CrawlDb update: db: " + crawlDb);
- LOG.info("CrawlDb update: segment: " + segment);
+ LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
LOG.info("CrawlDb update: URL normalizing: " + normalize);
LOG.info("CrawlDb update: URL filtering: " + filter);
@@ -71,13 +79,27 @@
job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
- job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
- job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
+ for (int i = 0; i < segments.length; i++) {
+ Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
+ Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
+ if (fs.exists(fetch) && fs.exists(parse)) {
+ job.addInputPath(fetch);
+ job.addInputPath(parse);
+ } else {
+ LOG.info(" - skipping invalid segment " + segments[i]);
+ }
+ }
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: Merging segment data into db.");
}
- JobClient.runJob(job);
+ try {
+ JobClient.runJob(job);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ if (fs.exists(job.getOutputPath())) fs.delete(job.getOutputPath());
+ throw e;
+ }
CrawlDb.install(job, crawlDb);
if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); }
@@ -93,7 +115,7 @@
job.setJobName("crawldb " + crawlDb);
- Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME);
+ Path current = new Path(crawlDb, CURRENT_NAME);
if (FileSystem.get(job).exists(current)) {
job.addInputPath(current);
}
@@ -114,7 +136,7 @@
Path newCrawlDb = job.getOutputPath();
FileSystem fs = new JobClient(job).getFs();
Path old = new Path(crawlDb, "old");
- Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME);
+ Path current = new Path(crawlDb, CURRENT_NAME);
if (fs.exists(current)) {
if (fs.exists(old)) fs.delete(old);
fs.rename(current, old);
@@ -122,6 +144,8 @@
fs.mkdirs(crawlDb);
fs.rename(newCrawlDb, current);
if (fs.exists(old)) fs.delete(old);
+ Path lock = new Path(crawlDb, LOCK_NAME);
+ LockUtil.removeLockFile(fs, lock);
}
public static void main(String[] args) throws Exception {
@@ -131,9 +155,11 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: CrawlDb <crawldb> <segment> [-normalize] [-filter] [-noAdditions]");
+ System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
System.err.println("\tcrawldb\tCrawlDb to update");
- System.err.println("\tsegment\tsegment name to update from");
+ System.err.println("\t-dir segments\tparent directory containing all segments to update from");
+ System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
+ System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
@@ -141,20 +167,36 @@
}
boolean normalize = false;
boolean filter = false;
+ boolean force = false;
+ final FileSystem fs = FileSystem.get(getConf());
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- if (args.length > 2) {
- for (int i = 2; i < args.length; i++) {
- if (args[i].equals("-normalize")) {
- normalize = true;
- } else if (args[i].equals("-filter")) {
- filter = true;
- } else if (args[i].equals("-noAdditions")) {
- additionsAllowed = false;
- }
+ HashSet<Path> dirs = new HashSet<Path>();
+ for (int i = 1; i < args.length; i++) {
+ if (args[i].equals("-normalize")) {
+ normalize = true;
+ } else if (args[i].equals("-filter")) {
+ filter = true;
+ } else if (args[i].equals("-force")) {
+ force = true;
+ } else if (args[i].equals("-noAdditions")) {
+ additionsAllowed = false;
+ } else if (args[i].equals("-dir")) {
+ Path[] paths = fs.listPaths(new Path(args[++i]), new PathFilter() {
+ public boolean accept(Path dir) {
+ try {
+ return fs.isDirectory(dir);
+ } catch (IOException ioe) {
+ return false;
+ }
+ }
+ });
+ dirs.addAll(Arrays.asList(paths));
+ } else {
+ dirs.add(new Path(args[i]));
}
}
try {
- update(new Path(args[0]), new Path(args[1]), normalize, filter, additionsAllowed);
+ update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
return 0;
} catch (Exception e) {
LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Dec 27 16:03:04 2006
@@ -105,12 +105,12 @@
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
JobConf job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
- job.addInputPath(new Path(dbs[i], CrawlDatum.DB_DIR_NAME));
+ job.addInputPath(new Path(dbs[i], CrawlDb.CURRENT_NAME));
}
JobClient.runJob(job);
FileSystem fs = FileSystem.get(getConf());
fs.mkdirs(output);
- fs.rename(job.getOutputPath(), new Path(output, CrawlDatum.DB_DIR_NAME));
+ fs.rename(job.getOutputPath(), new Path(output, CrawlDb.CURRENT_NAME));
}
public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Dec 27 16:03:04 2006
@@ -68,7 +68,7 @@
private void openReaders(String crawlDb, Configuration config) throws IOException {
if (readers != null) return;
FileSystem fs = FileSystem.get(config);
- readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDatum.DB_DIR_NAME), config);
+ readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
}
private void closeReaders() {
@@ -243,7 +243,7 @@
JobConf job = new NutchJob(config);
job.setJobName("stats " + crawlDb);
- job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+ job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(CrawlDbStatMapper.class);
@@ -301,10 +301,10 @@
} else if (k.equals("scx")) {
LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
} else if (k.equals("sct")) {
- LOG.info("avg score:\t" + (float) ((float) (val.get() / totalCnt.get()) / 1000.0f));
+ LOG.info("avg score:\t" + (float) ((double) (val.get() / totalCnt.get()) / 1000.0));
} else if (k.startsWith("status")) {
int code = Integer.parseInt(k.substring(k.indexOf(' ') + 1));
- LOG.info(k + " (" + CrawlDatum.statNames[code] + "):\t" + val);
+ LOG.info(k + " (" + CrawlDatum.getStatusName((byte)code) + "):\t" + val);
} else LOG.info(k + ":\t" + val);
}
}
@@ -344,7 +344,7 @@
JobConf job = new NutchJob(config);
job.setJobName("dump " + crawlDb);
- job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+ job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
@@ -373,10 +373,8 @@
JobConf job = new NutchJob(config);
job.setJobName("topN prepare " + crawlDb);
- job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+ job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(Text.class);
- job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(CrawlDbTopNMapper.class);
job.setReducerClass(IdentityReducer.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Dec 27 16:03:04 2006
@@ -27,12 +27,14 @@
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
/** Merge new page entries with existing entries. */
public class CrawlDbReducer implements Reducer {
public static final Log LOG = LogFactory.getLog(CrawlDbReducer.class);
+
private int retryMax;
private CrawlDatum result = new CrawlDatum();
private ArrayList linked = new ArrayList();
@@ -51,60 +53,81 @@
OutputCollector output, Reporter reporter)
throws IOException {
- CrawlDatum highest = null;
+ CrawlDatum fetch = null;
CrawlDatum old = null;
byte[] signature = null;
linked.clear();
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
+ if (CrawlDatum.hasDbStatus(datum)) {
+ if (old == null) {
+ old = datum;
+ } else {
+ // always take the latest version
+ if (old.getFetchTime() < datum.getFetchTime()) old = datum;
+ }
+ continue;
+ }
- if (highest == null || datum.getStatus() > highest.getStatus()) {
- highest = datum; // find highest status
+ if (CrawlDatum.hasFetchStatus(datum)) {
+ if (fetch == null) {
+ fetch = datum;
+ } else {
+ // always take the latest version
+ if (fetch.getFetchTime() < datum.getFetchTime()) fetch = datum;
+ }
+ continue;
}
- switch (datum.getStatus()) { // find old entry, if any
- case CrawlDatum.STATUS_DB_UNFETCHED:
- case CrawlDatum.STATUS_DB_FETCHED:
- case CrawlDatum.STATUS_DB_GONE:
- old = datum;
- break;
+ switch (datum.getStatus()) { // collect other info
case CrawlDatum.STATUS_LINKED:
linked.add(datum);
break;
case CrawlDatum.STATUS_SIGNATURE:
signature = datum.getSignature();
+ break;
+ default:
+ LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
}
}
// if it doesn't already exist, skip it
if (old == null && !additionsAllowed) return;
- // initialize with the latest version
- result.set(highest);
+ // if there is no fetched datum, perhaps there is a link
+ if (fetch == null && linked.size() > 0) {
+ fetch = (CrawlDatum)linked.get(0);
+ }
+
+ // still no new data - record only unchanged old data, if exists, and return
+ if (fetch == null) {
+ if (old != null) // at this point at least "old" should be present
+ output.collect(key, old);
+ else
+ LOG.warn("Missing fetch and old value, signature=" + signature);
+ return;
+ }
+
+ // initialize with the latest version, be it fetch or link
+ result.set(fetch);
if (old != null) {
// copy metadata from old, if exists
if (old.getMetaData().size() > 0) {
result.getMetaData().putAll(old.getMetaData());
// overlay with new, if any
- if (highest.getMetaData().size() > 0)
- result.getMetaData().putAll(highest.getMetaData());
+ if (fetch.getMetaData().size() > 0)
+ result.getMetaData().putAll(fetch.getMetaData());
}
// set the most recent valid value of modifiedTime
- if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) {
+ if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
result.setModifiedTime(old.getModifiedTime());
}
}
+
+ switch (fetch.getStatus()) { // determine new status
- switch (highest.getStatus()) { // determine new status
-
- case CrawlDatum.STATUS_DB_UNFETCHED: // no new entry
- case CrawlDatum.STATUS_DB_FETCHED:
- case CrawlDatum.STATUS_DB_GONE:
- result.set(old); // use old
- break;
-
- case CrawlDatum.STATUS_LINKED: // highest was link
+ case CrawlDatum.STATUS_LINKED: // it was link
if (old != null) { // if old exists
result.set(old); // use it
} else {
@@ -122,11 +145,21 @@
break;
case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
- if (highest.getSignature() == null) result.setSignature(signature);
+ if (fetch.getSignature() == null) result.setSignature(signature);
result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
result.setNextFetchTime();
break;
+ case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
+ if (fetch.getSignature() == null) result.setSignature(signature);
+ result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
+ result.setNextFetchTime();
+ break;
+ case CrawlDatum.STATUS_FETCH_REDIR_PERM:
+ if (fetch.getSignature() == null) result.setSignature(signature);
+ result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
+ result.setNextFetchTime();
+ break;
case CrawlDatum.STATUS_SIGNATURE:
if (LOG.isWarnEnabled()) {
LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
@@ -135,7 +168,7 @@
case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
if (old != null)
result.setSignature(old.getSignature()); // use old signature
- if (highest.getRetriesSinceFetch() < retryMax) {
+ if (fetch.getRetriesSinceFetch() < retryMax) {
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
} else {
result.setStatus(CrawlDatum.STATUS_DB_GONE);
@@ -149,7 +182,7 @@
break;
default:
- throw new RuntimeException("Unknown status: " + highest.getStatus() + " " + key);
+ throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
}
try {
@@ -159,6 +192,8 @@
LOG.warn("Couldn't update score, key=" + key + ": " + e);
}
}
+ // remove generation time, if any
+ result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
output.collect(key, result);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Dec 27 16:03:04 2006
@@ -31,13 +31,16 @@
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -47,8 +50,10 @@
public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
+ public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
public static final String CRAWL_TOP_N = "crawl.topN";
public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
+ public static final String CRAWL_GEN_DELAY = "crawl.gen.delay";
public static final Log LOG = LogFactory.getLog(Generator.class);
public static class SelectorEntry implements Writable {
@@ -77,6 +82,7 @@
/** Selects entries due for fetch. */
public static class Selector implements Mapper, Partitioner, Reducer {
+ private LongWritable genTime = new LongWritable(System.currentTimeMillis());
private long curTime;
private long limit;
private long count;
@@ -91,6 +97,8 @@
private boolean byIP;
private long dnsFailure = 0L;
private boolean filter;
+ private long genDelay;
+ private boolean runUpdatedb;
public void configure(JobConf job) {
curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
@@ -102,6 +110,10 @@
scfilters = new ScoringFilters(job);
hostPartitioner.configure(job);
filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
+ genDelay = job.getLong(CRAWL_GEN_DELAY, 7L) * 3600L * 24L * 1000L;
+ long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
+ if (time > 0) genTime.set(time);
+ runUpdatedb = job.getBoolean(GENERATE_UPDATE_CRAWLDB, false);
}
public void close() {}
@@ -125,12 +137,18 @@
}
CrawlDatum crawlDatum = (CrawlDatum)value;
- if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
+ if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE ||
+ crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM)
return; // don't retry
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
+ LongWritable oldGenTime = (LongWritable)crawlDatum.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
+ if (oldGenTime != null) { // awaiting fetch & update
+ if (oldGenTime.get() + genDelay > curTime) // still wait for update
+ return;
+ }
float sort = 1.0f;
try {
sort = scfilters.generatorSortValue((Text)key, crawlDatum, sort);
@@ -141,6 +159,8 @@
}
// sort by decreasing score, using DecreasingFloatComparator
sortValue.set(sort);
+ // record generation time
+ crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
entry.datum = crawlDatum;
entry.url = (Text)key;
output.collect(sortValue, entry); // invert for sort by score
@@ -247,7 +267,7 @@
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
SelectorEntry entry = (SelectorEntry)value;
output.collect(entry.url, entry.datum);
- }
+ }
}
/** Sort fetch lists by hash of URL. */
@@ -286,6 +306,38 @@
}
}
+ /**
+ * Update the CrawlDB so that the next generate won't include the same URLs.
+ */
+ public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
+ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+ if (key instanceof FloatWritable) { // tempDir source
+ SelectorEntry se = (SelectorEntry)value;
+ output.collect(se.url, se.datum);
+ } else {
+ output.collect(key, value);
+ }
+ }
+
+ public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+ CrawlDatum orig = null;
+ LongWritable genTime = null;
+ while (values.hasNext()) {
+ CrawlDatum val = (CrawlDatum)values.next();
+ if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
+ genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
+ } else {
+ orig = val;
+ }
+ }
+ if (genTime != null) {
+ orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
+ }
+ output.collect(key, orig);
+ }
+
+ }
+
public Generator() {
}
@@ -298,21 +350,25 @@
public Path generate(Path dbDir, Path segments)
throws IOException {
return generate(dbDir, segments, -1, Long.MAX_VALUE, System
- .currentTimeMillis(), true);
+ .currentTimeMillis(), true, false);
}
/** Generate fetchlists in a segment. */
public Path generate(Path dbDir, Path segments,
- int numLists, long topN, long curTime, boolean filter)
+ int numLists, long topN, long curTime, boolean filter,
+ boolean force)
throws IOException {
Path tempDir =
new Path(getConf().get("mapred.temp.dir", ".") +
- "/generate-temp-"+
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ "/generate-temp-"+ System.currentTimeMillis());
Path segment = new Path(segments, generateSegmentName());
Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
+
+ Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
+ FileSystem fs = FileSystem.get(getConf());
+ LockUtil.createLockFile(fs, lock, force);
LOG.info("Generator: Selecting best-scoring urls due for fetch.");
LOG.info("Generator: starting");
@@ -322,7 +378,7 @@
LOG.info("Generator: topN: " + topN);
}
- // map to inverted subset due for fetch, sort by link count
+ // map to inverted subset due for fetch, sort by score
JobConf job = new NutchJob(getConf());
job.setJobName("generate: select " + segment);
@@ -335,10 +391,12 @@
numLists = 1;
}
job.setLong(CRAWL_GEN_CUR_TIME, curTime);
+ // record real generation time
+ job.setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
job.setLong(CRAWL_TOP_N, topN);
job.setBoolean(CRAWL_GENERATE_FILTER, filter);
- job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
+ job.setInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(Selector.class);
@@ -350,7 +408,22 @@
job.setOutputKeyClass(FloatWritable.class);
job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
job.setOutputValueClass(SelectorEntry.class);
- JobClient.runJob(job);
+ try {
+ JobClient.runJob(job);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ throw e;
+ }
+
+ // check that we selected at least some entries ...
+ SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(job, tempDir);
+ if (readers == null || readers.length == 0 || !readers[0].next(new FloatWritable())) {
+ LOG.warn("Generator: 0 records selected for fetching, exiting ...");
+ LockUtil.removeLockFile(fs, lock);
+ fs.delete(tempDir);
+ return null;
+ }
+ for (int i = 0; i < readers.length; i++) readers[i].close();
// invert again, partition by host, sort by url hash
if (LOG.isInfoEnabled()) {
@@ -373,9 +446,43 @@
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputKeyComparatorClass(HashComparator.class);
- JobClient.runJob(job);
-
- new JobClient(getConf()).getFs().delete(tempDir);
+ try {
+ JobClient.runJob(job);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ fs.delete(tempDir);
+ throw e;
+ }
+ if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
+ // update the db from tempDir
+ Path tempDir2 =
+ new Path(getConf().get("mapred.temp.dir", ".") +
+ "/generate-temp-"+ System.currentTimeMillis());
+
+ job = new NutchJob(getConf());
+ job.setJobName("generate: updatedb " + dbDir);
+ job.addInputPath(tempDir);
+ job.addInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(CrawlDbUpdater.class);
+ job.setReducerClass(CrawlDbUpdater.class);
+ job.setOutputFormat(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ job.setOutputPath(tempDir2);
+ try {
+ JobClient.runJob(job);
+ CrawlDb.install(job, dbDir);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ fs.delete(tempDir);
+ fs.delete(tempDir2);
+ throw e;
+ }
+ fs.delete(tempDir2);
+ }
+ LockUtil.removeLockFile(fs, lock);
+ fs.delete(tempDir);
if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); }
@@ -402,7 +509,7 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]");
+ System.out.println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]");
return -1;
}
@@ -412,6 +519,7 @@
long topN = Long.MAX_VALUE;
int numFetchers = -1;
boolean filter = true;
+ boolean force = false;
for (int i = 2; i < args.length; i++) {
if ("-topN".equals(args[i])) {
@@ -425,13 +533,16 @@
curTime += numDays * 1000L * 60 * 60 * 24;
} else if ("-noFilter".equals(args[i])) {
filter = false;
+ } else if ("-force".equals(args[i])) {
+ force = true;
}
}
try {
- generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter);
- return 0;
+ Path seg = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, force);
+ if (seg == null) return -2;
+ else return 0;
} catch (Exception e) {
LOG.fatal("Generator: " + StringUtils.stringifyException(e));
return -1;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Dec 27 16:03:04 2006
@@ -78,7 +78,7 @@
}
if (url != null) { // if it passes
value.set(url); // collect it
- CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
+ CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, interval);
datum.setScore(scoreInjected);
try {
scfilters.injectedScore(value, datum);
@@ -102,7 +102,22 @@
public void reduce(WritableComparable key, Iterator values,
OutputCollector output, Reporter reporter)
throws IOException {
- output.collect(key, (Writable)values.next()); // just collect first value
+ CrawlDatum old = null;
+ CrawlDatum injected = null;
+ while (values.hasNext()) {
+ CrawlDatum val = (CrawlDatum)values.next();
+ if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
+ injected = val;
+ injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ } else {
+ old = val;
+ }
+ }
+ CrawlDatum res = null;
+ if (old != null) res = old; // don't overwrite existing value
+ else res = injected;
+
+ output.collect(key, res);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Dec 27 16:03:04 2006
@@ -36,6 +36,7 @@
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.*;
+import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -44,7 +45,8 @@
public static final Log LOG = LogFactory.getLog(LinkDb.class);
- public static String CURRENT_NAME = "current";
+ public static final String CURRENT_NAME = "current";
+ public static final String LOCK_NAME = ".locked";
private int maxAnchorLength;
private int maxInlinks;
@@ -178,20 +180,11 @@
OutputCollector output, Reporter reporter)
throws IOException {
- Inlinks result = null;
+ Inlinks result = new Inlinks();
while (values.hasNext()) {
Inlinks inlinks = (Inlinks)values.next();
- if (result == null) { // optimize a common case
- if (inlinks.size() < maxInlinks) {
- result = inlinks;
- continue;
- } else {
- result = new Inlinks();
- }
- }
-
int end = Math.min(maxInlinks - result.size(), inlinks.size());
Iterator it = inlinks.iterator();
int i = 0;
@@ -199,10 +192,11 @@
result.add((Inlink)it.next());
}
}
+ if (result.size() == 0) return;
output.collect(key, result);
}
- public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter) throws IOException {
+ public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
final FileSystem fs = FileSystem.get(getConf());
Path[] files = fs.listPaths(segmentsDir, new PathFilter() {
public boolean accept(Path f) {
@@ -212,11 +206,14 @@
return false;
}
});
- invert(linkDb, files, normalize, filter);
+ invert(linkDb, files, normalize, filter, force);
}
- public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
+ public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
+ Path lock = new Path(linkDb, LOCK_NAME);
+ FileSystem fs = FileSystem.get(getConf());
+ LockUtil.createLockFile(fs, lock, force);
if (LOG.isInfoEnabled()) {
LOG.info("LinkDb: starting");
LOG.info("LinkDb: linkdb: " + linkDb);
@@ -230,8 +227,12 @@
}
job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
}
- JobClient.runJob(job);
- FileSystem fs = FileSystem.get(getConf());
+ try {
+ JobClient.runJob(job);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ throw e;
+ }
if (fs.exists(linkDb)) {
if (LOG.isInfoEnabled()) {
LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
@@ -241,7 +242,13 @@
job = LinkDb.createMergeJob(getConf(), linkDb, normalize, filter);
job.addInputPath(new Path(linkDb, CURRENT_NAME));
job.addInputPath(newLinkDb);
- JobClient.runJob(job);
+ try {
+ JobClient.runJob(job);
+ } catch (IOException e) {
+ LockUtil.removeLockFile(fs, lock);
+ fs.delete(newLinkDb);
+ throw e;
+ }
fs.delete(newLinkDb);
}
LinkDb.install(job, linkDb);
@@ -257,8 +264,6 @@
job.setJobName("linkdb " + linkDb);
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(Text.class);
- job.setInputValueClass(ParseData.class);
job.setMapperClass(LinkDb.class);
// if we don't run the mergeJob, perform normalization/filtering now
@@ -293,8 +298,6 @@
job.setJobName("linkdb merge " + linkDb);
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(Text.class);
- job.setInputValueClass(Inlinks.class);
job.setMapperClass(LinkDbFilter.class);
job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
@@ -322,6 +325,7 @@
fs.mkdirs(linkDb);
fs.rename(newLinkDb, current);
if (fs.exists(old)) fs.delete(old);
+ LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}
public static void main(String[] args) throws Exception {
@@ -331,10 +335,11 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-noNormalize] [-noFilter]");
+ System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
System.err.println("\tlinkdb\toutput LinkDb to create or update");
System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
System.err.println("\tseg1 seg2 ...\t list of segment directories");
+ System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
System.err.println("\t-noNormalize\tdon't normalize link URLs");
System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
return -1;
@@ -345,6 +350,7 @@
ArrayList segs = new ArrayList();
boolean filter = true;
boolean normalize = true;
+ boolean force = false;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-dir")) {
segDir = new Path(args[++i]);
@@ -362,10 +368,12 @@
normalize = false;
} else if (args[i].equalsIgnoreCase("-noFilter")) {
filter = false;
+ } else if (args[i].equalsIgnoreCase("-force")) {
+ force = true;
} else segs.add(new Path(args[i]));
}
try {
- invert(db, (Path[])segs.toArray(new Path[segs.size()]), normalize, filter);
+ invert(db, (Path[])segs.toArray(new Path[segs.size()]), normalize, filter, force);
return 0;
} catch (Exception e) {
LOG.fatal("LinkDb: " + StringUtils.stringifyException(e));
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Wed Dec 27 16:03:04 2006
@@ -43,6 +43,7 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.ProtocolStatus;
/**
* A writable map, with a similar behavior as <code>java.util.HashMap</code>.
@@ -94,6 +95,7 @@
addToMap(FloatWritable.class, new Byte((byte) -117));
addToMap(IntWritable.class, new Byte((byte) -116));
addToMap(ObjectWritable.class, new Byte((byte) -115));
+ addToMap(ProtocolStatus.class, new Byte((byte) -114));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Dec 27 16:03:04 2006
@@ -149,7 +149,7 @@
switch(status.getCode()) {
case ProtocolStatus.SUCCESS: // got a page
- pstatus = output(url, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
+ pstatus = output(url, datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() &&
pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
@@ -158,10 +158,17 @@
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
url = new Text(newUrl);
- redirecting = true;
- redirectCount++;
- if (LOG.isDebugEnabled()) {
- LOG.debug(" - content redirect to " + url);
+ if (maxRedirect > 0) {
+ redirecting = true;
+ redirectCount++;
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - content redirect to " + url + " (fetching now)");
+ }
+ } else {
+ output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_FETCH_REDIR_TEMP);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - content redirect to " + url + " (fetching later)");
+ }
}
} else if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect skipped: " +
@@ -172,15 +179,29 @@
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
+ int code;
+ if (status.getCode() == ProtocolStatus.MOVED) {
+ code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
+ } else {
+ code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
+ }
+ output(url, datum, content, status, code);
String newUrl = status.getMessage();
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
url = new Text(newUrl);
- redirecting = true;
- redirectCount++;
- if (LOG.isDebugEnabled()) {
- LOG.debug(" - protocol redirect to " + url);
+ if (maxRedirect > 0) {
+ redirecting = true;
+ redirectCount++;
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - protocol redirect to " + url + " (fetching now)");
+ }
+ } else {
+ output(url, new CrawlDatum(), null, null, code);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - protocol redirect to " + url + " (fetching later)");
+ }
}
} else if (LOG.isDebugEnabled()) {
LOG.debug(" - protocol redirect skipped: " +
@@ -198,7 +219,7 @@
// intermittent blocking - retry without increasing the counter
case ProtocolStatus.WOULDBLOCK:
case ProtocolStatus.BLOCKED:
- output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
+ output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;
// permanent failures
@@ -207,21 +228,21 @@
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
case ProtocolStatus.NOTMODIFIED:
- output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
- output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
}
if (redirecting && redirectCount >= maxRedirect) {
if (LOG.isInfoEnabled()) {
LOG.info(" - redirect count exceeded " + url);
}
- output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
}
} while (redirecting && (redirectCount < maxRedirect));
@@ -229,7 +250,7 @@
} catch (Throwable t) { // unexpected exception
logError(url, t.toString());
- output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
+ output(url, datum, null, null, CrawlDatum.STATUS_FETCH_RETRY);
}
}
@@ -254,10 +275,11 @@
}
private ParseStatus output(Text key, CrawlDatum datum,
- Content content, int status) {
+ Content content, ProtocolStatus pstatus, int status) {
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
+ if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
if (content == null) {
String url = key.toString();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Dec 27 16:03:04 2006
@@ -41,6 +41,7 @@
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
@@ -286,7 +287,7 @@
job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
}
- job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+ job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
job.setInputFormat(InputFormat.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Dec 27 16:03:04 2006
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.metadata;
+import org.apache.hadoop.io.Text;
+
/**
* A collection of Nutch internal metadata constants.
@@ -36,5 +38,13 @@
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
public static final String SCORE_KEY = "nutch.crawl.score";
+
+ public static final String GENERATE_TIME_KEY = "_ngt_";
+
+ public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY);
+
+ public static final String PROTO_STATUS_KEY = "_pst_";
+
+ public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Dec 27 16:03:04 2006
@@ -95,7 +95,7 @@
final Path segmentDumpFile = new Path(job.getOutputPath(), name);
// Get the old copy out of the way
- fs.delete(segmentDumpFile);
+ if (fs.exists(segmentDumpFile)) fs.delete(segmentDumpFile);
final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile));
return new RecordWriter() {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Wed Dec 27 16:03:04 2006
@@ -104,13 +104,13 @@
if (args.length == 0) {
System.err.println("Usage: CrawlDbConverter <oldDb> <newDb> [-withMetadata]");
System.err.println("\toldDb\tname of the crawldb that uses UTF8 class.");
- System.err.println("\tnewDb\tname of the crawldb that will use Text class.");
- System.err.println("\twithMetadata\tconvert also all metadata keys using UTF8 to Text.");
+ System.err.println("\tnewDb\tname of the output crawldb that will use Text class.");
+ System.err.println("\twithMetadata\tconvert also all metadata keys that use UTF8 to Text.");
return -1;
}
JobConf job = new NutchJob(getConf());
FileSystem fs = FileSystem.get(getConf());
- Path oldDb = new Path(args[0], CrawlDatum.DB_DIR_NAME);
+ Path oldDb = new Path(args[0], CrawlDb.CURRENT_NAME);
Path newDb =
new Path(oldDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?view=auto&rev=490607
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Wed Dec 27 16:03:04 2006
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Utility methods for handling application-level locking.
+ *
+ * @author Andrzej Bialecki
+ */
+public class LockUtil {
+
+ /**
+ * Create a lock file.
+ * @param fs filesystem
+ * @param lockFile name of the lock file
+ * @param accept if true, and the target file exists, consider it valid. If false
+ * and the target file exists, throw an IOException.
+ * @throws IOException if accept is false, and the target file already exists,
+ * or if it's a directory.
+ */
+ public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException {
+ if (fs.exists(lockFile)) {
+ if(!accept)
+ throw new IOException("lock file " + lockFile + " already exists.");
+ if (fs.isDirectory(lockFile))
+ throw new IOException("lock file " + lockFile + " already exists and is a directory.");
+ // do nothing - the file already exists.
+ } else {
+ // make sure parents exist
+ fs.mkdirs(lockFile.getParent());
+ fs.createNewFile(lockFile);
+ }
+ }
+
+ /**
+ * Remove lock file. NOTE: applications enforce the semantics of this file -
+ * this method simply removes any file with a given name.
+ * @param fs filesystem
+ * @param lockFile lock file name
+ * @return false, if the lock file doesn't exist. True, if it existed and was
+ * successfully removed.
+ * @throws IOException if lock file exists but it is a directory.
+ */
+ public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException {
+ if (!fs.exists(lockFile)) return false;
+ if (fs.isDirectory(lockFile))
+ throw new IOException("lock file " + lockFile + " exists but is a directory!");
+ return fs.delete(lockFile);
+ }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Wed Dec 27 16:03:04 2006
@@ -53,7 +53,7 @@
public static void createCrawlDb(FileSystem fs, Path crawldb, List<URLCrawlDatum> init)
throws Exception {
LOG.trace("* creating crawldb: " + crawldb);
- Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
+ Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000")
.toString(), Text.class, CrawlDatum.class);
Iterator<URLCrawlDatum> it = init.iterator();
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Wed Dec 27 16:03:04 2006
@@ -125,7 +125,7 @@
private void createCrawlDb(FileSystem fs, Path crawldb, TreeSet init, CrawlDatum cd) throws Exception {
LOG.fine("* creating crawldb: " + crawldb);
- Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
+ Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class);
Iterator it = init.iterator();
while (it.hasNext()) {
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Dec 27 16:03:04 2006
@@ -259,20 +259,14 @@
Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
myConfiguration, true);
- Path fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-
- ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
-
- // verify all got filtered out
- assertEquals(0, fetchList.size());
+ assertNull("should be null (0 entries)", generatedSegment);
generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- fetchlistPath = new Path(new Path(generatedSegment,
+ Path fetchlistPath = new Path(new Path(generatedSegment,
CrawlDatum.GENERATE_DIR_NAME), "part-00000");
- fetchList = readContents(fetchlistPath);
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
// verify nothing got filtered
assertEquals(list.size(), fetchList.size());
@@ -317,7 +311,7 @@
// generate segment
Generator g = new Generator(config);
Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
- Long.MAX_VALUE, filter);
+ Long.MAX_VALUE, filter, false);
return generatedSegment;
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Wed Dec 27 16:03:04 2006
@@ -105,7 +105,7 @@
}
private List<String> readCrawldb() throws IOException{
- Path dbfile=new Path(crawldbPath,CrawlDatum.DB_DIR_NAME + "/part-00000/data");
+ Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
System.out.println("reading:" + dbfile);
SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
ArrayList<String> read=new ArrayList<String>();
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=490607&r1=490606&r2=490607
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Wed Dec 27 16:03:04 2006
@@ -88,7 +88,7 @@
//generate
Generator g=new Generator(conf);
Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
- Long.MAX_VALUE, Long.MAX_VALUE, false);
+ Long.MAX_VALUE, Long.MAX_VALUE, false, false);
long time=System.currentTimeMillis();
//fetch