You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/20 22:04:21 UTC
svn commit: r326994 - in /lucene/nutch/branches/mapred: conf/ src/java/org/apache/nutch/crawl/
Author: cutting
Date: Thu Oct 20 13:04:17 2005
New Revision: 326994
URL: http://svn.apache.org/viewcvs?rev=326994&view=rev
Log:
Use OPIC score when crawling and as boost in search, replacing the use of simple incoming link count.
Modified:
lucene/nutch/branches/mapred/conf/crawl-tool.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original)
+++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Thu Oct 20 13:04:17 2005
@@ -15,13 +15,6 @@
</property>
<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
-</property>
-
-<property>
<name>db.ignore.internal.links</name>
<value>false</value>
<description>If true, when adding new links to a page, links from
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Oct 20 13:04:17 2005
@@ -271,17 +271,6 @@
fetchlist. -1 if unlimited.</description>
</property>
-<!-- fetchlist tool properties -->
-
-<property>
- <name>fetchlist.score.by.link.count</name>
- <value>true</value>
- <description>If true, set page scores on fetchlist entries based on
- log(number of anchors), instead of using original page scores. This
- results in prioritization of pages with many incoming links.
- </description>
-</property>
-
<!-- fetcher properties -->
<property>
@@ -448,13 +437,6 @@
value of this parameter. This is compiled into indexes, so, when
this is changed, pages must be re-indexed for it to take
effect.</description>
-</property>
-
-<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
</property>
<property>
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu Oct 20 13:04:17 2005
@@ -114,7 +114,7 @@
new LinkDb(conf).invert(linkDb, segments); // invert links
// index, dedup & merge
- new Indexer(conf).index(indexes, linkDb, fs.listFiles(segments));
+ new Indexer(conf).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
new DeleteDuplicates(conf).dedup(new File[] { indexes });
new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java Thu Oct 20 13:04:17 2005
@@ -31,7 +31,7 @@
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 1;
+ private final static byte CUR_VERSION = 2;
public static final byte STATUS_DB_UNFETCHED = 1;
public static final byte STATUS_DB_FETCHED = 2;
@@ -47,15 +47,18 @@
private long fetchTime = System.currentTimeMillis();
private byte retries;
private float fetchInterval;
- private int linkCount;
+ private float score = 1.0f;
public CrawlDatum() {}
public CrawlDatum(int status, float fetchInterval) {
this.status = (byte)status;
this.fetchInterval = fetchInterval;
- if (status == STATUS_LINKED)
- linkCount = 1;
+ }
+
+ public CrawlDatum(int status, float fetchInterval, float score) {
+ this(status, fetchInterval);
+ this.score = score;
}
//
@@ -80,8 +83,8 @@
this.fetchInterval = fetchInterval;
}
- public int getLinkCount() { return linkCount; }
- public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
+ public float getScore() { return score; }
+ public void setScore(float score) { this.score = score; }
//
// writable methods
@@ -96,18 +99,18 @@
public void readFields(DataInput in) throws IOException {
byte version = in.readByte(); // read version
- if (version > CUR_VERSION) // check version
+ if (version != CUR_VERSION) // check version
throw new VersionMismatchException(CUR_VERSION, version);
status = in.readByte();
fetchTime = in.readLong();
retries = in.readByte();
fetchInterval = in.readFloat();
- linkCount = in.readInt();
+ score = in.readFloat();
}
- /** The number of bytes into a CrawlDatum that the linkCount is stored. */
- private static final int LINK_COUNT_OFFSET = 1 + 1 + 8 + 1 + 4;
+ /** The number of bytes into a CrawlDatum that the score is stored. */
+ private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
public void write(DataOutput out) throws IOException {
out.writeByte(CUR_VERSION); // store current version
@@ -115,7 +118,7 @@
out.writeLong(fetchTime);
out.writeByte(retries);
out.writeFloat(fetchInterval);
- out.writeInt(linkCount);
+ out.writeFloat(score);
}
/** Copy the contents of another instance into this instance. */
@@ -124,7 +127,7 @@
this.fetchTime = that.fetchTime;
this.retries = that.retries;
this.fetchInterval = that.fetchInterval;
- this.linkCount = that.linkCount;
+ this.score = that.score;
}
@@ -132,11 +135,11 @@
// compare methods
//
- /** Sort by decreasing link count. */
+ /** Sort by decreasing score. */
public int compareTo(Object o) {
CrawlDatum that = (CrawlDatum)o;
- if (that.linkCount != this.linkCount)
- return that.linkCount - this.linkCount;
+ if (that.score != this.score)
+ return (that.score - this.score) > 0 ? 1 : -1;
if (that.status != this.status)
return this.status - that.status;
if (that.fetchTime != this.fetchTime)
@@ -153,10 +156,10 @@
public Comparator() { super(CrawlDatum.class); }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int linkCount1 = readInt(b1,s1+LINK_COUNT_OFFSET);
- int linkCount2 = readInt(b2,s2+LINK_COUNT_OFFSET);
- if (linkCount2 != linkCount1) {
- return linkCount2 - linkCount1;
+ float score1 = readFloat(b1,s1+SCORE_OFFSET);
+ float score2 = readFloat(b2,s2+SCORE_OFFSET);
+ if (score2 != score1) {
+ return (score2 - score1) > 0 ? 1 : -1;
}
int status1 = b1[s1+1];
int status2 = b2[s2+1];
@@ -194,7 +197,7 @@
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
buf.append("Retry interval: " + getFetchInterval() + " days\n");
- buf.append("Link Count: " + getLinkCount() + "\n");
+ buf.append("Score: " + getScore() + "\n");
return buf.toString();
}
@@ -207,7 +210,7 @@
(this.fetchTime == other.fetchTime) &&
(this.retries == other.retries) &&
(this.fetchInterval == other.fetchInterval) &&
- (this.linkCount == other.linkCount);
+ (this.score == other.score);
}
public int hashCode() {
@@ -216,7 +219,7 @@
((int)fetchTime) ^
retries ^
Float.floatToIntBits(fetchInterval) ^
- linkCount;
+ Float.floatToIntBits(score);
}
public Object clone() {
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Oct 20 13:04:17 2005
@@ -38,11 +38,10 @@
CrawlDatum highest = null;
CrawlDatum old = null;
- int linkCount = 0;
+ float scoreIncrement = 0.0f;
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
- linkCount += datum.getLinkCount(); // sum link counts
if (highest == null || datum.getStatus() > highest.getStatus()) {
highest = datum; // find highest status
@@ -52,6 +51,10 @@
case CrawlDatum.STATUS_DB_UNFETCHED:
case CrawlDatum.STATUS_DB_FETCHED:
old = datum;
+ break;
+ case CrawlDatum.STATUS_LINKED:
+ scoreIncrement += datum.getScore();
+ break;
}
}
@@ -71,6 +74,7 @@
} else {
result = highest; // use new entry
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ result.setScore(1.0f); // initial score is 1.0f
}
break;
@@ -99,7 +103,7 @@
}
if (result != null) {
- result.setLinkCount(linkCount);
+ result.setScore(result.getScore() + scoreIncrement);
output.collect(key, result);
}
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Oct 20 13:04:17 2005
@@ -38,6 +38,7 @@
public static final String DIGEST_KEY = "nutch.content.digest";
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+ public static final String SCORE_KEY = "nutch.crawl.score";
public static class InputFormat extends SequenceFileInputFormat {
/** Don't split inputs, to keep things polite. */
@@ -197,6 +198,8 @@
(DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
content.getMetadata().setProperty // add segment to metadata
(SEGMENT_NAME_KEY, segmentName);
+ content.getMetadata().setProperty // add score to metadata
+ (SCORE_KEY, Float.toString(datum.getScore()));
Parse parse = null;
if (parsing) {
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Oct 20 13:04:17 2005
@@ -61,7 +61,7 @@
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
- output.collect(crawlDatum, key); // invert for sort by linkCount
+ output.collect(crawlDatum, key); // invert for sort by score
}
/** Partition by host (value). */
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Thu Oct 20 13:04:17 2005
@@ -138,11 +138,9 @@
super(conf);
}
- private boolean boostByLinkCount;
private float scorePower;
public void configure(JobConf job) {
- boostByLinkCount = job.getBoolean("indexer.boost.by.link.count", false);
scorePower = job.getFloat("indexer.score.power", 0.5f);
}
@@ -150,7 +148,8 @@
OutputCollector output, Reporter reporter)
throws IOException {
Inlinks inlinks = null;
- CrawlDatum crawlDatum = null;
+ CrawlDatum dbDatum = null;
+ CrawlDatum fetchDatum = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
@@ -158,7 +157,21 @@
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
- crawlDatum = (CrawlDatum)value;
+ CrawlDatum datum = (CrawlDatum)value;
+ switch (datum.getStatus()) {
+ case CrawlDatum.STATUS_DB_UNFETCHED:
+ case CrawlDatum.STATUS_DB_FETCHED:
+ case CrawlDatum.STATUS_DB_GONE:
+ dbDatum = datum;
+ break;
+ case CrawlDatum.STATUS_FETCH_SUCCESS:
+ case CrawlDatum.STATUS_FETCH_RETRY:
+ case CrawlDatum.STATUS_FETCH_GONE:
+ fetchDatum = datum;
+ break;
+ default:
+ throw new RuntimeException("Unexpected status: "+datum.getStatus());
+ }
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
} else if (value instanceof ParseText) {
@@ -168,7 +181,8 @@
}
}
- if (crawlDatum == null || parseText == null || parseData == null) {
+ if (fetchDatum == null || dbDatum == null
+ || parseText == null || parseData == null) {
return; // only have inlinks
}
@@ -183,10 +197,8 @@
// add digest, used by dedup
doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));
- // compute boost
- float boost =
- IndexSegment.calculateBoost(1.0f, scorePower, boostByLinkCount,
- anchors.length);
+ // boost is opic
+ float boost = (float)Math.pow(dbDatum.getScore(), scorePower);
// apply boost to all indexed fields.
doc.setBoost(boost);
// store boost for use by explain and dedup
@@ -205,7 +217,7 @@
FetcherOutput fo =
new FetcherOutput(new FetchListEntry(true,new Page((UTF8)key),anchors),
null, null);
- fo.setFetchDate(crawlDatum.getFetchTime());
+ fo.setFetchDate(fetchDatum.getFetchTime());
// run indexing filters
doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),fo);
@@ -217,7 +229,7 @@
output.collect(key, new ObjectWritable(doc));
}
- public void index(File indexDir, File linkDb, File[] segments)
+ public void index(File indexDir, File crawlDb, File linkDb, File[] segments)
throws IOException {
LOG.info("Indexer: starting");
@@ -232,6 +244,7 @@
job.addInputDir(new File(segments[i], ParseText.DIR_NAME));
}
+ job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
job.addInputDir(new File(linkDb, LinkDb.CURRENT_NAME));
job.setInputFormat(InputFormat.class);
@@ -253,17 +266,18 @@
public static void main(String[] args) throws Exception {
Indexer indexer = new Indexer(NutchConf.get());
- if (args.length < 2) {
- System.err.println("Usage: <index> <linkdb> <segment> <segment> ...");
+ if (args.length < 4) {
+ System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
return;
}
- File[] segments = new File[args.length-2];
- for (int i = 2; i < args.length; i++) {
- segments[i-2] = new File(args[i]);
+ File[] segments = new File[args.length-3];
+ for (int i = 3; i < args.length; i++) {
+ segments[i-3] = new File(args[i]);
}
- indexer.index(new File(args[0]), new File(args[1]), segments);
+ indexer.index(new File(args[0]), new File(args[1]), new File(args[2]),
+ segments);
}
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Thu Oct 20 13:04:17 2005
@@ -64,6 +64,11 @@
// collect outlinks for subsequent db update
Outlink[] links = parse.getData().getOutlinks();
+
+ // compute OPIC score contribution
+ float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+ score /= links.length;
+
for (int i = 0; i < links.length; i++) {
String toUrl = links[i].getToUrl();
try {
@@ -75,7 +80,7 @@
if (toUrl != null)
crawlOut.append(new UTF8(toUrl),
new CrawlDatum(CrawlDatum.STATUS_LINKED,
- interval));
+ interval, score));
}
}