You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/06/09 22:57:49 UTC
svn commit: r189818 -
/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Author: cutting
Date: Thu Jun 9 13:57:49 2005
New Revision: 189818
URL: http://svn.apache.org/viewcvs?rev=189818&view=rev
Log:
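Implement anchor truncation, as in trunk: anchor text longer than db.max.anchor.length (default 100) is cut to that length before the inverted link is collected.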
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=189818&r1=189817&r2=189818&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jun 9 13:57:49 2005
@@ -35,6 +35,7 @@
public static String CURRENT_NAME = "current";
+ private int maxAnchorLength;
public LinkDb() {
super(null);
@@ -45,7 +46,9 @@
super(conf);
}
- public void configure(JobConf job) {}
+ public void configure(JobConf job) {
+ maxAnchorLength = job.getInt("db.max.anchor.length", 100);
+ }
public void map(WritableComparable key, Writable value,
OutputCollector output) throws IOException {
@@ -56,7 +59,11 @@
for (int i = 0; i < outlinks.length; i++) {
Outlink outlink = outlinks[i];
inlinks.clear();
- inlinks.add(new Inlink(fromUrl, outlink.getAnchor()));
+ String anchor = outlink.getAnchor(); // truncate long anchors
+ if (anchor.length() > maxAnchorLength) {
+ anchor = anchor.substring(0, maxAnchorLength);
+ }
+ inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
output.collect(new UTF8(outlink.getToUrl()), inlinks);
}
}